10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * ihevc_itrans_recon_atom_intr.c 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Contains function definitions for inverse quantization, inverse 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform and reconstruction 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @author 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 100470 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 100592 (edited by) 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par List of Functions: 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * - ihevc_itrans_recon_4x4_ttype1_ssse3() 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * - ihevc_itrans_recon_4x4_ssse3() 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * - ihevc_itrans_recon_8x8_ssse3() 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
******************************************************************************* 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h> 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <string.h> 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h" 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h" 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h" 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h" 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h" 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_tables.h" 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_iquant_itrans_recon.h" 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_macros.h" 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h> 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h> 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h> 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * This 
function performs inverse quantization, inverse transform 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * type1(DST) and reconstruction for 4x4 input block 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description: 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Performs inverse quantization , inverse transform type 1 and adds 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * prediction data and clips output to 8 bit 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input 4x4 coefficients 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_tmp 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Temporary 4x4 buffer for storing inverse 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform 1st stage output 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction 4x4 block 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_dequant_coeff 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Dequant Coeffs 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[out] pu1_dst 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output 4x4 block 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_div 
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter / 6 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_rem 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter % 6 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input stride 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction stride 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output Stride 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Zero columns in pi2_src 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns Void 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid 
ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src, 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp, 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_pred, 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 pred_strd, 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_cols, 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_rows) 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_2; 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_3; 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_4; 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_10; 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20; 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_21; 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_22; 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_23; 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
__m128i m_temp_reg_24; 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_25; 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_30; 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_31; 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_32; 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_33; 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_34; 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_35; 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_36; 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_rdng_factor; 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_count; 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row0; 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row1; 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row2; 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row3; 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_zero = _mm_setzero_si128(); 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i4_shift = IT_SHIFT_STAGE_1; 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(zero_cols); 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(zero_rows); 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(pi2_tmp); 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src); 
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src); 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src); 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src); 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0); 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1); 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2); 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3); 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0); 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1); 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2); 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3); 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0); 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2); 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/ 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[4] in m_temp_reg_14 */ 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[4] = src[0] - src[2] + src[3] */ 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2); 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[3] in m_temp_reg_13 */ 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6); 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3); 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1); 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[0] in m_temp_reg_10 */ 
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2); 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[1] in m_temp_reg_11 */ 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3); 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[2] in m_temp_reg_12 */ 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3); 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[4] in m_temp_reg_14 */ 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[4] = src[0] - src[2] + src[3] */ 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3); 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 1 outputs stored in m_temp_reg_20-23 */ 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5); 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = 
_mm_slli_epi32(m_temp_reg_10, 1); 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10); 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6); 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3); 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11); 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5); 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1); 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11); 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6); 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3); 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12); 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6); 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3); 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10); 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5); 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1); 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12); 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21); 
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6); 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3); 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1); 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13); 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4); 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4); 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35); 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13); 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4); 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor); 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 */ 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar i4_shift = IT_SHIFT_STAGE_2; 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20); 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21); 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22); 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/ 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30); 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31); 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[4] stored in m_temp_reg_4 */ 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[3] stored in m_temp_reg_3 */ 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6); 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3); 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1); 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13); 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[0] stored in m_temp_reg_0 */ 
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[1] stored in m_temp_reg_1 */ 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21); 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[2] stored in m_temp_reg_2 */ 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23); 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* c[4] stored in m_temp_reg_4 */ 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23); 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 output generation */ 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5); 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1); 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = 
_mm_sub_epi32(m_temp_reg_10, m_temp_reg_0); 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6); 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3); 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1); 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5); 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1); 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1); 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6); 
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3); 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2); 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6); 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3); 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0); 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5); 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1); 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2); 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11); 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2 
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6); 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3); 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1); 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13); 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3); 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4); 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4); 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3); 
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35); 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4); 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor); 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, 
m_temp_reg_25); 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Recon and store */ 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 *pi4_dst = (WORD32 *)pu1_dst; 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero); 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero); 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero); 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero); 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_0 = 
_mm_cvtepu8_epi16(m_temp_reg_0); 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1); 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2); 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/ 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1); 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3); 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0); 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1); 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21); 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0); 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4); 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8); 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12); 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (WORD32 *)(pu1_dst); 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = 
_mm_cvtsi128_si32(m_temp_reg_1); 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (WORD32 *)(pu1_dst); 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (WORD32 *)(pu1_dst); 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * This function performs inverse quantization, inverse transform 4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * (DCT) and reconstruction for 4x4 input block 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description: 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Performs inverse quantization , inverse transform and adds 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * prediction data and clips output to 8 bit 
 *
 * @param[in] pi2_src
 *  Input 4x4 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 4x4 buffer for storing inverse
 *  transform 1st stage output (marked UNUSED in this implementation)
 *
 * @param[in] pu1_pred
 *  Prediction 4x4 block
 *
 * @param[out] pu1_dst
 *  Output 4x4 block
 *
 * @note
 *  pi2_dequant_coeff (Dequant Coeffs), qp_div (Quantization parameter / 6)
 *  and qp_rem (Quantization parameter % 6) were listed here as parameters,
 *  but they are not part of this function's signature
 *
 * @param[in] src_strd
 *  Input stride
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction stride 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output Stride 5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Zero columns in pi2_src 5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns Void 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src, 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp, 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_pred, 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 pred_strd, 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_cols, 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_rows) 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_2; 5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_3; 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_4; 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_10; 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_15; 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20; 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_21; 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_22; 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_23; 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_24; 5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_25; 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_30; 5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_31; 5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_33; 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i 
m_temp_reg_34; 5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_rdng_factor; 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_count; 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row0; 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row1; 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row2; 5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_ge_zero16b_flag_row3; 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_zero = _mm_setzero_si128(); 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i4_shift = IT_SHIFT_STAGE_1; 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(zero_rows); 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(zero_cols); 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(pi2_tmp); 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src); 5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src); 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src); 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src); 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0); 5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1); 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2); 6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3); 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0); 6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1); 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2); 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3); 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0); 6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2); 6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/ 6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6); 
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6); 6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5); 6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2); 6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36 6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6); 6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4); 6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1); 6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3); 6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22); 6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24); 6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83 6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6); 6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = 
_mm_slli_epi32(m_temp_reg_1, 4); 6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1); 6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1); 6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22); 6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24); 6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83 6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5); 6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2); 6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36 6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_31 */ 6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11); 6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = 
_mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_30 */ 6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_33 */ 6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13); 6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 + add */ 6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 + add */ 6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_34 */ 6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15); 6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 1 outputs */ 6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33); 6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33); 6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34); 6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34); 6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, 
m_temp_reg_23); 7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 */ 7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar i4_shift = IT_SHIFT_STAGE_2; 7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/ 7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30); 7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31); 7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = 
_mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0); 7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1); 7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20); 7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21); 7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22); 7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/ 7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6); 7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*{ 7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36 7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83 7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, 
m_coeff3);//src[3]*83 7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36 7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar }*/ 7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5); 7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2); 7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1); 7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36 7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6); 7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4); 7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1); 7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23); 7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2); 7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4); 7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83 7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6); 7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4); 7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1); 7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22); 7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2); 7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4); 7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83 7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5); 7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2); 7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1); 7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36 7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6); 7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_31 */ 7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = 
_mm_sub_epi32(m_temp_reg_10, m_temp_reg_11); 7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_30 */ 7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_33 */ 7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13); 7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 + add */ 8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 + add */ 8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_34 */ 8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15); 8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 outputs */ 8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33); 8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33); 8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34); 8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34); 8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, 
m_temp_reg_21); 8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Recon and store */ 8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD32 *pu4_dst = (UWORD32 *)pu1_dst; 8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1); 8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2); 8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3); 8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero); 8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero); 8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero); 8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero); 8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1); 8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3); 8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0); 8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1); 8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21); 8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0); 8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4); 8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8); 8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12); 8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu4_dst = (UWORD32 *)(pu1_dst); 8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu4_dst = (UWORD32 *)(pu1_dst); 8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu4_dst = (UWORD32 *)(pu1_dst); 8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * This function performs inverse quantization, inverse transform and 8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * reconstruction for 8x8 input block 8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description: 9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Performs inverse quantization, inverse transform and adds the 9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * prediction data and clips output to 8 bit 9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src 9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input 8x8 coefficients 9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_tmp 9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Temporary 8x8 buffer for storing inverse 9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform 1st stage output 9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred 9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction 8x8 block 9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_dequant_coeff 
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Dequant Coeffs 9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[out] pu1_dst 9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output 8x8 block 9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd 9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input stride 9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_div 9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter / 6 9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_rem 9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter % 6 9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd 9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction stride 9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd 9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output Stride 9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols 9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Zero columns in pi2_src 9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns Void 9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src, 9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp, 9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_pred, 9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 pred_strd, 9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_cols, 9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_rows) 9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_2; 9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_3; 9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_5; 9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_6; 9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_7; 9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_4; 9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar __m128i m_temp_reg_10; 9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_15; 9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_16; 9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_17; 9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20; 9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_21; 9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_22; 9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_23; 9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_24; 9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_25; 9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_26; 9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_27; 9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_30; 9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_31; 9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_32; 9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_33; 9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_34; 9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_35; 9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_36; 9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_37; 
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_40; 9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_41; 9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_42; 9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_43; 9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_44; 9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_45; 9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_46; 9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_47; 9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_50; 9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_51; 9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_52; 9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_53; 10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_54; 10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_55; 10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_56; 10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_57; 10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_60; 10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_61; 10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_62; 10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_63; 10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_64; 10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_65; 10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_66; 
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_67; 10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_70; 10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_71; 10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_72; 10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_73; 10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_74; 10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_75; 10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_76; 10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_77; 10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 check_row_stage_1; /* Lokesh */ 10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 check_row_stage_2; /* Lokesh */ 10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_rdng_factor; 10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //__m128i m_count; 10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i4_shift = IT_SHIFT_STAGE_1; 10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(zero_rows); 10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(zero_cols); 10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(pi2_tmp); 10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 
1 : 0; 10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0; 10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src); 10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src); 10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src); 10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src); 10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src); 10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src); 10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src); 10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += src_strd; 10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src); 10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(!check_row_stage_2) 10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(!check_row_stage_1) 10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 0,4 row in 0 , 1 Rishab 10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1 is present in the registers 
m_temp_reg_16 and m_temp_reg_17 */ 10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 2,6 row in 4, 5 Rishab 10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o0:1B*89+3B*75,5B*50+7B*18 11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 0 of destination computed here */ 11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_50 */ 11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 7 of destination computed here */ 11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_57 */ 11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Upper 8 bytes of both registers are zero due to zero_cols*/ 11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, 
m_temp_reg_30); 11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_setzero_si128(); 11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o1:1B*75-3B*18,5B*89+7B*50 11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o2 in the next block */ 11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 1 of destination computed here */ 11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_51 */ 11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 6 of destination computed here */ 11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_56 */ 11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o2:1B*50-3B*89,5B*18+7B*75 11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o3 in the next block */ 11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 2 of destination computed here */ 11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_52 */ 11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 5 of destination computed here */ 11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_55 */ 11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o3:1B*18-3B*50,5B*75-7B*89 12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 3 of destination computed here */ 12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_53 */ 12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 4 of destination 
computed here */ 12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_54 */ 12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose of the destination 8x8 matrix done here */ 12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* respectively */ 12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_6 = 
_mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_setzero_si128(); 12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_setzero_si128(); 12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_setzero_si128(); 12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_setzero_si128(); 12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 
12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 0,4 row in 0 , 1 Rishab 12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 2,6 row in 4, 5 Rishab 13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o0:1B*89+3B*75,5B*50+7B*18 13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 0 of destination computed here */ 13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_50 */ 13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 7 of destination computed here */ 13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_57 */ 13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
/* Upper 8 bytes of both registers are zero due to zero_cols*/ 13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_setzero_si128(); 13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o1:1B*75-3B*18,5B*89+7B*50 13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o2 in the next block */ 13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 1 of destination computed here */ 13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_51 */ 13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 6 of destination computed here */ 13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_56 */ 13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 
13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o2:1B*50-3B*89,5B*18+7B*75 14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o3 in the next block */ 14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 
14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 2 of destination computed here */ 14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_52 */ 14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 5 of destination computed here */ 14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_55 */ 14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o3:1B*18-3B*50,5B*75-7B*89 14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = 
_mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 3 of destination computed here */ 14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_53 */ 14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 4 of destination computed here */ 14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_54 */ 14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, 
i4_shift); 14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose of the destination 8x8 matrix done here */ 14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* respectively */ 14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, 
m_temp_reg_15); 14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, 
m_temp_reg_6); 14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_setzero_si128(); 14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_setzero_si128(); 15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_setzero_si128(); 15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_setzero_si128(); 15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 */ 15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar i4_shift = IT_SHIFT_STAGE_2; 15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add 15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub 15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); 15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); 15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = 
_mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o0 in the next block */ 15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e3 stored in m_temp_reg_46 
and m_temp_reg_47 */ 15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57); 15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57); 
15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o0:1B*89+3B*75,1T*89+3T*75 15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o1 in the next block */ 15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 0 of destination computed here */ 15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_50 */ 15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 7 of destination computed here */ 15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_57 */ 15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = 
_mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o1:1B*75-3B*18,1T*75-3T*18 16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = 
_mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o2 in the next block */ 16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 1 of destination computed here */ 16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_51 */ 16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 6 of destination computed here */ 16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_56 */ 16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, 
m_temp_reg_33); 16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o2:1B*50-3B*89,5T*18+7T*75. 16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 
o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o3 in the next block */ 16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 2 of destination computed here */ 16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_52 */ 16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 5 of destination computed here */ 16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_55 */ 16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, 
m_rdng_factor); 16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o3:1B*18-3B*50,1T*18-3T*50 16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 3 of destination computed here */ 
16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_53 */ 16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 4 of destination computed here */ 16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_54 */ 17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = 
_mm_srai_epi32(m_temp_reg_23, i4_shift); 17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose of the destination 8x8 matrix done here */ 17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* respectively */ 17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 
17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = 
_mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Recon and store */ 17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 
17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_setzero_si128(); 17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 
17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 
18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(!check_row_stage_1) 18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and 
m_temp_reg_11 */ 18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 0,4 row in 0 , 1 Rishab 18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0 is 
present in the registers m_temp_reg_14 and m_temp_reg_15 */ 18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 2,6 row in 4, 5 Rishab 18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]); 18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]); 18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75,m_temp_reg_77); 19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75,m_temp_reg_77); 
19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o0:1B*89+3B*75,1T*89+3T*75 19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 0 of destination computed here */ 19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_50 */ 19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 7 of destination computed here */ 19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_57 */ 19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o2 in the next block */ 19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 1 of destination computed here */ 19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_51 */ 19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 6 of destination computed here */ 19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It 
is stored in m_temp_reg_56 */ 19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o2:1B*50-3B*89,1T*50-3T*89 19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = 
_mm_madd_epi16(m_temp_reg_60, m_coeff1); 19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o3 in the next block */ 19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 2 of destination computed here */ 20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_52 */ 20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 5 of destination computed here */ 20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in 
m_temp_reg_55 */ 20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o3:1B*18-3B*50,1T*18-3T*50 20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = 
_mm_madd_epi16(m_temp_reg_60, m_coeff3); 20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 3 of destination computed here */ 20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_53 */ 20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 4 of destination computed here */ 20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_54 */ 20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose of the destination 8x8 matrix done here */ 20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
/* respectively */ 20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 
ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 0,4 row in 0 , 1 Rishab 21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 
21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Interleaving 2,6 row in 4, 5 Rishab 21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = 
_mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 0 of destination computed here */ 21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_50 */ 21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 7 of destination computed here */ 21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_57 */ 21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 
22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o2 in the next block */ 22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 1 of destination computed here */ 22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_51 */ 
22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 6 of destination computed here */ 22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_56 */ 22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o3 in the next block */ 22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = 
_mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 2 of destination computed here */ 22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_52 */ 22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 5 of destination computed here */ 22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_55 */ 22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 
22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, 
m_temp_reg_27); 23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 3 of destination computed here */ 23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_53 */ 23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 4 of destination computed here */ 23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_54 */ 23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_63 
= _mm_srai_epi32(m_temp_reg_63, i4_shift); 23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose of the destination 8x8 matrix done here */ 23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* respectively */ 23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 
23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = 
_mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 */ 23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar i4_shift = IT_SHIFT_STAGE_2; 23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add 23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub 23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 
23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); 24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); 24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_66 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); 24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_64 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); 24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_62 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); 
24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_60 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); 24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_16 = _mm_sub_epi32(m_temp_reg_64, m_temp_reg_66); 24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_14 = _mm_add_epi32(m_temp_reg_60, m_temp_reg_62); 24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o0 in the next block */ 24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, 
m_temp_reg_53); 24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*m_temp_reg_3 = _mm_srli_si128(m_temp_reg_53, 8); 24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3); 24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e */ 24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o */ 24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* m_temp_reg_4 = _mm_cvtepi16_epi32(m_temp_reg_55); 24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_srli_si128(m_temp_reg_55, 8); 24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_cvtepi16_epi32(m_temp_reg_57); 24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srli_si128(m_temp_reg_57, 8); 24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_cvtepi16_epi32(m_temp_reg_5); 24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_cvtepi16_epi32(m_temp_reg_7); 24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57); 24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57); 24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o1 in the next block */ 24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 0 of destination computed here */ 
24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_50 */ 24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 7 of destination computed here */ 24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_57 */ 24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, 
i4_shift); 25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o2 in the next block */ 25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 1 of destination computed here */ 25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_51 */ 25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 6 of destination computed here */ 25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_56 */ 25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 
25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff for computing o3 in the next block */ 25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 2 of destination computed here */ 25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_52 */ 25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 5 of destination computed here */ 25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_55 */ 25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 
25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 3 of destination computed here */ 26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_53 */ 26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Column 4 of destination computed here */ 26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* It is stored in m_temp_reg_54 */ 26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = 
_mm_add_epi32(m_temp_reg_23, m_rdng_factor); 26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose of the destination 8x8 matrix done here */ 26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* respectively */ 26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, 
m_temp_reg_53); 26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 
26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Recon and store */ 26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 
26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_setzero_si128(); 26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2743