18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/****************************************************************************** 28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Copyright (C) 2015 The Android Open Source Project 48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Licensed under the Apache License, Version 2.0 (the "License"); 68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * you may not use this file except in compliance with the License. 78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * You may obtain a copy of the License at: 88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * http://www.apache.org/licenses/LICENSE-2.0 108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Unless required by applicable law or agreed to in writing, software 128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * distributed under the License is distributed on an "AS IS" BASIS, 138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * See the License for the specific language governing permissions and 158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * limitations under the License. 168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ***************************************************************************** 188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore 198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @file 238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * ih264_resi_trans_quant_sse42.c 248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief 268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Contains function definitions single stage forward transform for H.264 278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * It will calculate the residue, do the cf and then do quantization 288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @author 308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Mohit [100664] 318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par List of Functions: 338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * - ih264_resi_trans_quant_4x4_sse42() 348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * - ih264_resi_trans_quant_chroma_4x4_sse42() 358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks 378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * None 388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* System include files */ 428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stddef.h> 438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* User include files */ 458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_typedefs.h" 468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_defs.h" 478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_size_defs.h" 488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_macros.h" 498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_trans_macros.h" 508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_trans_data.h" 518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_structs.h" 528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_trans_quant_itrans_iquant.h" 538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <immintrin.h> 548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief 588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * This function performs forward transform and quantization on a 4*4 block 598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description: 618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * The function accepts source buffer and estimation buffer. From these, it 628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * computes the residue. This is residue is then transformed and quantized. 638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * The transform and quantization are in placed computed. They use the residue 648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * buffer for this. 
 *
 * @param[in] pu1_src
 *   Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *   Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 *   Pointer to residual sub-block (receives the quantized coefficients)
 *
 * @param[in] src_strd
 *   Source stride
 *
 * @param[in] pred_strd
 *   Prediction stride
 *
 * @param[in] dst_strd
 *   Destination stride (NOTE: not present in the function signature)
 *
 * @param[in] u4_qbits
 *    QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 *   Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 *   Pointer to Forward Quant Scale Matrix
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor 948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Quantization Round factor 958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz 978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Total non-zero coefficients in the current sub-block 988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns 1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks 1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * None 1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, 1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd, 1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, 1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz, 1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD16 *pi2_alt_dc_addr) 1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0; 1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 mask0, mask1; 1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sum0, sum1, sum2, cmp0, cmp1; 1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i rnd_fact 
= _mm_set1_epi32(u4_round_factor); 1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp_2 = _mm_set1_epi16(2); 1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp_1 = _mm_set1_epi16(1); 1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i pred_r0, pred_r1, pred_r2, pred_r3; 1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp0, temp1, temp2, temp3; 1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero 1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sign_reg0, sign_reg2; 1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i scalemat_r0_r1, scalemat_r2_r3; 12425e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar 12525e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar UNUSED (pu2_threshold_matrix); 12625e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar 1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row 1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row 1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits 1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits 1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits 1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadl_epi64((__m128i *) 
(&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits 1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_cvtepu8_epi16(src_r0); 1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_cvtepu8_epi16(src_r1); 1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_cvtepu8_epi16(src_r2); 1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_cvtepu8_epi16(src_r3); 1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits 1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits 1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits 1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits 1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits 1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits 1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits 1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits 1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_sub_epi16(src_r0, pred_r0); 1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
src_r1 = _mm_sub_epi16(src_r1, pred_r1); 1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(src_r2, pred_r2); 1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(src_r3, pred_r3); 1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Perform Forward transform */ 1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* DCT [ Horizontal transformation ] */ 1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Matrix transpose 1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* 1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a0 a1 a2 a3 1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * b0 b1 b2 b3 1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * c0 c1 c2 c3 1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * d0 d1 d2 d3 1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3 1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3 1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1 1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3 1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0 1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1 
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2 1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3 1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*----------------------------------------------------------*/ 1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x0 = z0 + z3 */ 1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi16(src_r0, src_r3); 1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x1 = z1 + z2 */ 1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(src_r1, src_r2); 1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x2 = z1 - z2 */ 1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi16(src_r1, src_r2); 1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x3 = z0 - z3 */ 1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi16(src_r0, src_r3); 1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z0 = x0 + x1 */ 1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi16(temp0, temp1); 1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z1 = (x3 << 1) + x2 */ 1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) 1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi16(src_r1, temp2); 1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z2 = x0 - x1 */ 1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(temp0, temp1); 1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z3 = x3 - (x2 << 1) */ 1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) 1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S src_r3 = _mm_sub_epi16(temp3, src_r3); 1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Matrix transpose 1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* 1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a0 b0 c0 d0 1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a1 b1 c1 d1 2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a2 b2 c2 d2 2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a3 b3 c3 d3 2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1 2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3 2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3 2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3 2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3 2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3 2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3 2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3 2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*----------------------------------------------------------*/ 2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x0 = z0 + z3 */ 2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi16(src_r0, src_r3); 2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
/* x1 = z1 + z2 */ 2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(src_r1, src_r2); 2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x2 = z1 - z2 */ 2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi16(src_r1, src_r2); 2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x3 = z0 - z3 */ 2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi16(src_r0, src_r3); 2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z0 = x0 + x1 */ 2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi16(temp0, temp1); 2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z1 = (x3 << 1) + x2 */ 2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) 2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi16(src_r1, temp2); 2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z2 = x0 - x1 */ 2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(temp0, temp1); 2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z3 = x3 - (x2 << 1) */ 2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) 2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(temp3, src_r3); 2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S tmp_dc = _mm_extract_epi16(src_r0,0); //a0 2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi2_alt_dc_addr = tmp_dc; 2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3 2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3 
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0); 2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2); 2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0); 2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2); 2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); 2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); 2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_abs_epi16(src_r0); 2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_abs_epi16(src_r2); 2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_srli_si128(src_r0, 8); 2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_cvtepu16_epi32(src_r0); 2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_cvtepu16_epi32(src_r1); 2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_srli_si128(src_r2, 8); 2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_cvtepu16_epi32(src_r2); 2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_cvtepu16_epi32(src_r3); 2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1); 2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8); 2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3); 2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8); 2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1); 2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3); 2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_mullo_epi32(temp0, src_r0); 2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_mullo_epi32(temp1, src_r1); 2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_mullo_epi32(temp2, src_r2); 2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_mullo_epi32(temp3, src_r3); 2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi32(temp0,rnd_fact); 2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi32(temp1,rnd_fact); 2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi32(temp2,rnd_fact); 2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi32(temp3,rnd_fact); 2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_srli_epi32(temp0,u4_qbits); 2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srli_epi32(temp1,u4_qbits); 2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srli_epi32(temp2,u4_qbits); 2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_srli_epi32(temp3,u4_qbits); 2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_packs_epi32 (temp0,temp1); 2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_packs_epi32 (temp2,temp3); 2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_sign_epi16(temp0, sign_reg0); 
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sign_epi16(temp2, sign_reg2); 2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); 2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2); 2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); 2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); 2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask0 = _mm_movemask_epi8(cmp0); 2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask1 = _mm_movemask_epi8(cmp1); 2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff = 0; 2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0) 2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0 == 0xffff) 2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff+=8; 2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_and_si128(temp_1, cmp0); 3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); 3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum2 = _mm_hadd_epi16(sum1, zero_8x16b); 3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff += _mm_cvtsi128_si32(sum2); 3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1) 
3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1 == 0xffff) 3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff+=8; 3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_and_si128(temp_1, cmp1); 3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); 3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum2 = _mm_hadd_epi16(sum1, zero_8x16b); 3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff += _mm_cvtsi128_si32(sum2); 3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Return total nonzero coefficients in the current sub block */ 3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_nonzero_coeff = 16 - u4_zero_coeff; 3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pu1_nnz = u4_nonzero_coeff; 3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief 3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * This function performs forward transform and quantization on a 4*4 chroma block 3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description: 
 *   The function accepts source buffer and estimation buffer. From these, it
 *   computes the residue. This residue is then transformed and quantized.
 *   The transform and quantization are computed in place. They use the residue
 *   buffer for this.
 *
 * @param[in] pu1_src
 *   Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *   Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *   Pointer to residual sub-block
 *
 * @param[in] src_strd
 *   Source stride
 *
 * @param[in] pred_strd
 *   Prediction stride
 *
 * @param[in] dst_strd
 *   Destination stride
 *
 * @param[in] u4_qbits
 *    QP_BITS_h264_4x4 + floor(QP/6)
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_threshold_matrix 3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Pointer to Forward Quant Threshold Matrix 3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_scale_matrix 3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Pointer to Forward Quant Scale Matrix 3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor 3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Quantization Round factor 3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz 3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Total non-zero coefficients in the current sub-block 3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns 3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks 3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * None 3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out, 3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd,WORD32 pred_strd, 3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_scale_matrix, 3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_threshold_matrix, 
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 u4_qbits,UWORD32 u4_round_factor, 3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr) 3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0; 3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 mask0, mask1; 3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i cmp0, cmp1, sum0, sum1, sum2; 3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); 3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp_2 = _mm_set1_epi16(2); 3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp_1 = _mm_set1_epi16(1); 3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i pred_r0, pred_r1, pred_r2, pred_r3; 3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp0, temp1, temp2, temp3; 3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero 3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sign_reg0, sign_reg2; 3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i scalemat_r0_r1, scalemat_r2_r3; 3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i chroma_mask = _mm_set1_epi16 (0xFF); 3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 40025e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar UNUSED (pu2_threshold_matrix); 40125e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar 4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row 4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r2_r3 = 
_mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row 4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits 4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits 4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits 4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits 4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_and_si128(src_r0, chroma_mask); 4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_and_si128(src_r1, chroma_mask); 4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_and_si128(src_r2, chroma_mask); 4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_and_si128(src_r3, chroma_mask); 4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// src_r0 = _mm_cvtepu8_epi16(src_r0); 4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// src_r1 = _mm_cvtepu8_epi16(src_r1); 4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// src_r2 = _mm_cvtepu8_epi16(src_r2); 4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// src_r3 = _mm_cvtepu8_epi16(src_r3); 4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits 4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits 
4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits 4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits 4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r0 = _mm_and_si128(pred_r0, chroma_mask); 4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r1 = _mm_and_si128(pred_r1, chroma_mask); 4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r2 = _mm_and_si128(pred_r2, chroma_mask); 4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pred_r3 = _mm_and_si128(pred_r3, chroma_mask); 4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits 4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits 4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits 4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S// pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits 4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_sub_epi16(src_r0, pred_r0); 4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_sub_epi16(src_r1, pred_r1); 4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(src_r2, pred_r2); 4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(src_r3, pred_r3); 4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Perform Forward transform */ 4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
/*-------------------------------------------------------------*/ 4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* DCT [ Horizontal transformation ] */ 4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Matrix transpose 4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* 4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a0 a1 a2 a3 4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * b0 b1 b2 b3 4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * c0 c1 c2 c3 4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * d0 d1 d2 d3 4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3 4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3 4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1 4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3 4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0 4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1 4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2 4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3 4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*----------------------------------------------------------*/ 
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x0 = z0 + z3 */ 4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi16(src_r0, src_r3); 4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x1 = z1 + z2 */ 4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(src_r1, src_r2); 4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x2 = z1 - z2 */ 4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi16(src_r1, src_r2); 4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x3 = z0 - z3 */ 4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi16(src_r0, src_r3); 4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z0 = x0 + x1 */ 4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi16(temp0, temp1); 4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z1 = (x3 << 1) + x2 */ 4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) 4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi16(src_r1, temp2); 4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z2 = x0 - x1 */ 4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(temp0, temp1); 4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z3 = x3 - (x2 << 1) */ 4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) 4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(temp3, src_r3); 4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Matrix transpose 4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* 4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a0 b0 c0 d0 4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a1 b1 c1 d1 
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a2 b2 c2 d2 4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a3 b3 c3 d3 4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1 4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3 4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3 4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3 4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3 4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3 4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3 4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3 4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*----------------------------------------------------------*/ 4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x0 = z0 + z3 */ 4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi16(src_r0, src_r3); 4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x1 = z1 + z2 */ 5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(src_r1, src_r2); 5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x2 = z1 - z2 */ 5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi16(src_r1, src_r2); 5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* x3 = z0 - z3 */ 
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi16(src_r0, src_r3); 5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z0 = x0 + x1 */ 5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi16(temp0, temp1); 5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z1 = (x3 << 1) + x2 */ 5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1) 5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi16(src_r1, temp2); 5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z2 = x0 - x1 */ 5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(temp0, temp1); 5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* z3 = x3 - (x2 << 1) */ 5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1) 5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(temp3, src_r3); 5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S tmp_dc = _mm_extract_epi16(src_r0,0); //a0 5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi2_alt_dc_addr = tmp_dc; 5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3 5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3 5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0); 5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2); 5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0); 
5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2); 5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); 5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); 5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_abs_epi16(src_r0); 5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_abs_epi16(src_r2); 5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_srli_si128(src_r0, 8); 5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_cvtepu16_epi32(src_r0); 5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_cvtepu16_epi32(src_r1); 5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_srli_si128(src_r2, 8); 5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_cvtepu16_epi32(src_r2); 5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_cvtepu16_epi32(src_r3); 5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1); 5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8); 5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3); 5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8); 5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1); 5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3); 5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = 
_mm_mullo_epi32(temp0, src_r0); 5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_mullo_epi32(temp1, src_r1); 5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_mullo_epi32(temp2, src_r2); 5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_mullo_epi32(temp3, src_r3); 5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi32(temp0,rnd_fact); 5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi32(temp1,rnd_fact); 5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi32(temp2,rnd_fact); 5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi32(temp3,rnd_fact); 5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_srli_epi32(temp0,u4_qbits); 5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srli_epi32(temp1,u4_qbits); 5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srli_epi32(temp2,u4_qbits); 5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_srli_epi32(temp3,u4_qbits); 5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_packs_epi32 (temp0,temp1); 5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_packs_epi32 (temp2,temp3); 5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_sign_epi16(temp0, sign_reg0); 5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sign_epi16(temp2, sign_reg2); 5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0); 5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) 
(&pi2_out[0]), temp0); 5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2); 5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); 5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); 5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask0 = _mm_movemask_epi8(cmp0); 5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask1 = _mm_movemask_epi8(cmp1); 5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff = 0; 5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0) 5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0 == 0xffff) 5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff+=8; 5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_and_si128(temp_1, cmp0); 5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); 5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum2 = _mm_hadd_epi16(sum1, zero_8x16b); 5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff += _mm_cvtsi128_si32(sum2); 5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1) 5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1 == 0xffff) 5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff+=8; 
5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_and_si128(temp_1, cmp1); 6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); 6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum2 = _mm_hadd_epi16(sum1, zero_8x16b); 6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff += _mm_cvtsi128_si32(sum2); 6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Return total nonzero coefficients in the current sub block */ 6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_nonzero_coeff = 16 - u4_zero_coeff; 6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pu1_nnz = u4_nonzero_coeff; 6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @brief 6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * This function performs forward hadamard transform and quantization on a 4*4 block 6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @par Description: 6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * The function accepts source buffer and estimation 
buffer. From these, it
 *  computes the residue. This residue is then transformed and quantized.
 *  The transform and quantization are computed in place. They use the residue
 *  buffer for this.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] u4_qbits
 *    QP_BITS_h264_4x4 + floor(QP/6)
 *
6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_threshold_matrix 6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Pointer to Forward Quant Threshold Matrix 6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] pu2_scale_matrix 6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Pointer to Forward Quant Scale Matrix 6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[in] u4_round_factor 6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Quantization Round factor 6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @param[out] pu1_nnz 6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Total non-zero coefficients in the current sub-block 6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @returns 6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * @remarks 6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * None 6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, 6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_scale_matrix, 6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, 6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 u4_round_factor,UWORD8 *pu1_nnz 6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ) 6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 
u4_zero_coeff,u4_nonzero_coeff=0; 6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i cmp0, cmp1, sum0, sum1, sum2; 6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 mask0, mask1; 6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0_r1, src_r2_r3, sign_reg; 6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero_8x16b = _mm_setzero_si128(); 6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp0, temp1, temp2, temp3; 6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3; 6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp_1 = _mm_set1_epi16(1); 6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); 6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); 6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 68425e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar UNUSED (pu2_threshold_matrix); 68525e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar 6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row 6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row 6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); 6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3 6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3 6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg = 
_mm_cmpgt_epi16(zero_8x16b, src_r2_r3); 6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3 6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3 6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Perform Inverse transform */ 6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Forward DC transform [ Horizontal transformation ] */ 6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Matrix transpose 7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* 7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a0 a1 a2 a3 7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * b0 b1 b2 b3 7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * c0 c1 c2 c3 7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * d0 d1 d2 d3 7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 
= _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi32(src_r0, src_r3); 7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi32(src_r1, src_r2); 7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi32(src_r1, src_r2); 7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi32(src_r0, src_r3); 7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi32(temp0, temp1); 7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi32(temp2, temp3); 7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi32(temp0, temp1); 7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi32(temp3, temp2); 7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Forward DC transform [ Vertical transformation ] */ 7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /*-------------------------------------------------------------*/ 7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Matrix transpose 7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* 7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a0 b0 c0 d0 7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a1 b1 c1 d1 7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a2 b2 c2 d2 7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * a3 b3 c3 d3 7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S */ 7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = 
_mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi32(src_r0, src_r3); 7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi32(src_r1, src_r2); 7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi32(src_r1, src_r2); 7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi32(src_r0, src_r3); 7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi32(temp0, temp1); 7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi32(temp2, temp3); 7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi32(temp0, temp1); 7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi32(temp3, temp2); 7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_srai_epi32(src_r0, 1); 7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_srai_epi32(src_r1, 1); 
7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_srai_epi32(src_r2, 1); 7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_srai_epi32(src_r3, 1); 7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Quantization 7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration 7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1); 7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2); 7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3); 7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively 7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3); 7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively 7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_slli_epi16(sign_reg2, 1); 7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively 7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg2 = _mm_add_epi16(temp_1,sign_reg2); 7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_abs_epi32(src_r0); //Absolute values 7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_abs_epi32(src_r1); 
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_abs_epi32(src_r2); 7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_abs_epi32(src_r3); 7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_mullo_epi32(scale_val, src_r0); //multiply by pu2_scale_matrix[0] 7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_mullo_epi32(scale_val, src_r1); 7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_mullo_epi32(scale_val, src_r2); 7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_mullo_epi32(scale_val, src_r3); 7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor 7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi32(temp1,rnd_fact); 7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi32(temp2,rnd_fact); 7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi32(temp3,rnd_fact); 7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_srli_epi32(temp0,u4_qbits); //RIght shift by qbits, unsigned variable, so shift right immediate works 7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srli_epi32(temp1,u4_qbits); 7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srli_epi32(temp2,u4_qbits); 7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_srli_epi32(temp3,u4_qbits); 7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only. 
7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_packs_epi32 (temp2,temp3); 7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration 7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sign_epi16(temp2, sign_reg2); 7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0); 8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2); 8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b); 8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b); 8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask0 = _mm_movemask_epi8(cmp0); 8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask1 = _mm_movemask_epi8(cmp1); 8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff = 0; 8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0) 8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0 == 0xffff) 8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff+=8; 8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_and_si128(temp_1, cmp0); 8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); 8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum2 = _mm_hadd_epi16(sum1, zero_8x16b); 
8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff += _mm_cvtsi128_si32(sum2); 8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1) 8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1 == 0xffff) 8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff+=8; 8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_and_si128(temp_1, cmp1); 8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); 8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum2 = _mm_hadd_epi16(sum1, zero_8x16b); 8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_zero_coeff += _mm_cvtsi128_si32(sum2); 8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* Return total nonzero coefficients in the current sub block */ 8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_nonzero_coeff = 16 - u4_zero_coeff; 8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_nnz[0] = u4_nonzero_coeff; 8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ******************************************************************************* 8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 
 * @brief
 *   This function performs forward hadamard transform and quantization on a 2*2 block
 *   for both U and V planes
 *
 * @par Description:
 *   The function reads eight 16-bit coefficients from the source buffer (four
 *   per plane, the first plane in elements 0..3 and the second in elements
 *   4..7). Each group of four is Hadamard transformed, quantized, and written
 *   to the destination buffer in the same plane order.
 *
 * @param[in] pi2_src
 *   Pointer to the eight input coefficients (four per plane)
 *
 * @param[out] pi2_dst
 *   Pointer to the eight quantized output coefficients (four per plane)
 *
 * @param[in] pu2_scale_matrix
 *   Pointer to Forward Quant Scale Matrix; only element 0 is used
 *
 * @param[in] pu2_threshold_matrix
 *   Pointer to Forward Quant Threshold Matrix (unused by this function)
 *
 * @param[in] u4_qbits
 *    QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *   Quantization Round factor
 *
 * @param[out] pu1_nnz
 *   Number of non-zero quantized coefficients, one entry per plane
 *
 * @returns
 *   None
 *
 * @remarks
 *   NNZ of the first plane is written to pu1_nnz[0] and NNZ of the second
 *   plane to pu1_nnz[1]
 *
 */

void
ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, 8968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_scale_matrix, 8978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, 8988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 u4_round_factor,UWORD8 *pu1_nnz) 8998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 900caab4fe0688e7c4b023d979b617fb16a275614f2Hamsalekha S WORD32 val, nonzero_coeff_0=0, nonzero_coeff_1=0; 9018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i cmp, cmp0, cmp1; 9028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sum0, sum1; 9038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 mask, mask0, mask1; 9048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; 9058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero_8x16b = _mm_setzero_si128(); 9068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); 9078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sign_reg0, sign_reg1; 9088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp_1 = _mm_set1_epi16(1); 9098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); 9108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 91125e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar UNUSED (pu2_threshold_matrix); 91225e8adb631df325607216ad6f3d6638442d9f453Harish Mahendrakar 9138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 9148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); 9158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits 9168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits 9178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 9198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 9208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 9228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 9238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 9258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 9268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 9288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 9298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 9318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 9328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Quantization 9338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value 
for later restoration 9348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1); 9358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively 9378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively 9388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively 9398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_0 = _mm_abs_epi32(plane_0); //Absolute values 9418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S plane_1 = _mm_abs_epi32(plane_1); 9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_scale_matrix[0] 9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_scale_matrix[0] 9458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor 9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi32(temp1,rnd_fact); 9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_srli_epi32(temp0,u4_qbits); //RIght shift by qbits, unsigned variable, so shift right immediate works 9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srli_epi32(temp1,u4_qbits); 9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only. 
9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration 9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0); 9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp = _mm_cmpeq_epi16(temp0, zero_8x16b); 9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask = _mm_movemask_epi8(cmp); 9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask0 = mask & 0xff; 9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S mask1 = mask>>8; 9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0) 9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask0 == 0xff) 9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S nonzero_coeff_0 += 4; 9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp0 = _mm_and_si128(temp_1, cmp); 9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp0, zero_8x16b); 9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val = _mm_cvtsi128_si32(sum1); 9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val = val & 0xffff; 9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S nonzero_coeff_0 += val; 9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1) 9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if(mask1 == 0xff) 9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S nonzero_coeff_1 += 4; 
9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S else 9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 9818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_srli_si128(cmp, 8); 9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S cmp1 = _mm_and_si128(temp_1, cmp1); 9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum0 = _mm_hadd_epi16(cmp1, zero_8x16b); 9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sum1 = _mm_hadd_epi16(sum0, zero_8x16b); 9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S nonzero_coeff_1 += _mm_cvtsi128_si32(sum1); 9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_nnz[0] = 4 - nonzero_coeff_0; 9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_nnz[1] = 4 - nonzero_coeff_1; 9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 993