10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * ihevc_itrans_recon_32x32_atom_intr.c 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Contains function definitions for inverse quantization, inverse 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform and reconstruction 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @author 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 100470 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par List of Functions: 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * - ihevc_itrans_recon_32x32_ssse3() 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h> 390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar#include <string.h> 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h" 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h" 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h" 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h" 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_tables.h" 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_iquant_itrans_recon.h" 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h" 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_macros.h" 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h> 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h> 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h> 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * This function performs inverse quantization, inverse transform and 
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * reconstruction for 32x32 input block 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description: 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Performs inverse quantization, inverse transform and adds the 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * prediction data and clips output to 8 bit 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input 32x32 coefficients 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_tmp 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Temporary scratch buffer for storing inverse 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform 1st stage output 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction 32x32 block 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_dequant_coeff 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Dequant Coeffs 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[out] pu1_dst 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output 32x32 block 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_div 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter / 6 
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_rem 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter % 6 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input stride 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction stride 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output Stride 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Zero columns in pi2_src 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns Void 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**/ 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_32x32_ssse3(WORD16 
*pi2_src, 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp, 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_pred, 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 pred_strd, 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_cols, 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_rows) 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Inverse Transform */ 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp_orig; 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*MEM_ALIGN16 WORD32 temp_array[1024]; 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MEM_ALIGN16 WORD16 temp1_array[1024];*/ 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *o_temp_ptr; 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *temp_ptr; 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_2; 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar __m128i m_temp_reg_3; 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_4; 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_5; 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_6; 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_7; 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_10; 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_15; 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_16; 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_17; 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_18; 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_19; 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20; 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_21; 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_22; 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_23; 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_30; 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_31; 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_32; 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_33; 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_34; 
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_35; 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_36; 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_37; 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_40; 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_41; 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_42; 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_43; 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_44; 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_45; 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_46; 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_47; 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_70; 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_71; 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_72; 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_73; 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_74; 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_75; 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_76; 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_77; 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_80; 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_81; 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_82; 
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_83; 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_84; 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_85; 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_86; 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_87; 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_90; 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_91; 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_92; 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_93; 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_94; 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_95; 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_96; 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_97; 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_rdng_factor; 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_count; 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp1, temp2, temp3, temp4; 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp5, temp6, temp7, temp8; 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
__m128i all_zero_reg; 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i; 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Lokesh*/ 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last24_cols_stg1; 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last24_rows_stg1; 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last28_rows_stg1; 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last28_rows_stg2; 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last24_rows_stg2; 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 trans_size_stg1; 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i4_shift = IT_SHIFT_STAGE_1; 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 trans_size = TRANS_SIZE_32; 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 
1 : 0; 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0; 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last24_rows_stg2 = zero_last24_cols_stg1; 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if((zero_last28_rows_stg2) || (zero_last24_cols_stg1)) 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar trans_size_stg1 = 8; 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar trans_size_stg1 = 32; 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar all_zero_reg = _mm_setzero_si128(); 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar o_temp_ptr = pi2_tmp; 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_ptr = (pi2_tmp + 1024); 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp += 2048; 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_orig = pi2_tmp; 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = 0; i < trans_size_stg1; i += 8) 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp_src = pi2_src; 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src); 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src); 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src); 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src); 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src); 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src); 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src); 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src); 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last28_rows_stg1) 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* eeo */ 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[0]= m_temp_reg_20 */ 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[1]= m_temp_reg_21 */ 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[0]= m_temp_reg_22 */ 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[1]= m_temp_reg_23 */ 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = m_temp_reg_14; 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = m_temp_reg_14; 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = 
m_temp_reg_14; //m_temp_reg_16; 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[0]= m_temp_reg_20 */ 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[1]= m_temp_reg_21 */ 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[0]= m_temp_reg_22 */ 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[1]= m_temp_reg_23 */ 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = m_temp_reg_14; 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = m_temp_reg_14; 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 
= m_temp_reg_14; //m_temp_reg_16; 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = o_temp_ptr; 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71); 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71); 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = 
_mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /**************************************************************************/ 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[0-3] */ 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[4-7] */ 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /***********************************************************************/ 5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[0-3] */ 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6); 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[4-7] */ 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6); 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[0-3] */ 
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[4-7] */ 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7); 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar pi2_scratch += 8; 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[0-3] */ 5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); 5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[4-7] */ 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_madd_epi16(m_temp_reg_14, m_coeff8); 5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last24_rows_stg1) 6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0]= m_temp_reg_20 */ 6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1]= m_temp_reg_21 */ 6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[0]= m_temp_reg_22 */ 6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[1]= m_temp_reg_23 */ 6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = m_temp_reg_14; 6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = m_temp_reg_14; 6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; 6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; 6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* for row 4 to 7 */ 6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0]= m_temp_reg_20 */ 6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1]= m_temp_reg_21 */ 6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[0]= m_temp_reg_22 */ 6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[1]= m_temp_reg_23 */ 6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = m_temp_reg_14; 6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = m_temp_reg_14; 6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; 
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; 6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[] */ 6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* for(k = 0; k < 4; k++) */ 6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_setzero_si128(); 6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo0[0-3] */ 6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = m_temp_reg_34; 6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = m_temp_reg_35; 6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo0[4-7] */ 6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = m_temp_reg_34; 6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = m_temp_reg_35; 6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo1[0-3] */ 7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = m_temp_reg_34; 7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = m_temp_reg_35; 7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); 7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = m_temp_reg_34; 7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = m_temp_reg_35; 7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = m_temp_reg_34; 7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp7 = m_temp_reg_35; 7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_madd_epi16(m_temp_reg_14, m_coeff4); 7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = m_temp_reg_34; 7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp6 = m_temp_reg_35; 7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = m_temp_reg_34; 7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp5 = m_temp_reg_35; 
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = m_temp_reg_34; 7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp8 = m_temp_reg_35; 7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* All values of ee[] array in pi2_temp */ 7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = o_temp_ptr; 8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); 8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); 8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
pi2_scratch += 8; 8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); 8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); 8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); 8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); 8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30); 8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); 
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); 8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); 8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); 8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); 8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /**************************************************************************/ 9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] 
*/ 9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); 9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); 9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); 9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); 9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[0-3] */ 9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); 9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); 9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[4-7] */ 9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); 9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); 9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /***********************************************************************/ 9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[0-3] */ 9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); 
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); 9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[4-7] */ 9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); 9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); 9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[0-3] */ 10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); 10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); 10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[4-7] */ 10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); 10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); 10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[0-3] */ 10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); 10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); 10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
pi2_scratch += 8; 10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[4-7] */ 10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); 10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); 10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0]= m_temp_reg_20 */ 10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1]= m_temp_reg_21 */ 10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[0]= m_temp_reg_22 */ 10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[1]= m_temp_reg_23 */ 10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* for row 4 to 7 */ 11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); 11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); 11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Interleaving row 8 and row 24*/ 11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); 11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[0]= m_temp_reg_20 */ 11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeeo[1]= m_temp_reg_21 */ 11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[0]= m_temp_reg_22 */ 11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeee[1]= m_temp_reg_23 */ 11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // eeo[] 11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* for(k = 0; k < 4; k++) */ 11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo0[0-3] */ 11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = 
_mm_srli_si128(m_temp_reg_72, 8); 11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); 11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); 11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); 11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo0[4-7] */ 11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo1[0-3] */ 11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); 12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); 12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo1[4-7] */ 12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); 12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); 12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo2[0-3] */ 12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo2[4-7] */ 12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo3[0-3] */ 12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 
12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo3[4-7] */ 12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* All values of ee[] array in pi2_temp */ 12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* for(k = 0; k < 8; k++) */ 13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = o_temp_ptr; 13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); 13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); 13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); 
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); 13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); 13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); 13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); 13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); 13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); 13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); 13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); 13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); 13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 
13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); 13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); 13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); 13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); 14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = 
_mm_add_epi32(m_temp_reg_93, m_temp_reg_30); 14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); 14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); 14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); 14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); 14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); 14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /**************************************************************************/ 14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); 15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); 15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, 
m_coeff4); 15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); 15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); 15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[0-3] */ 15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); 15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); 15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 
15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[4-7] */ 15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); 15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); 15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /***********************************************************************/ 15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25 16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[0-3] */ 16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 
16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); 16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); 16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[4-7] */ 16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 
16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); 16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); 16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[0-3] */ 16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); 16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); 16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[4-7] */ 16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = 
_mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); 16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); 17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[0-3] */ 17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); 17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); 17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 
17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[4-7] */ 17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); 17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); 17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* All e[] are done */ 17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /****************************/ 17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp_src = pi2_src + src_strd; 17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 
17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src); 17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src); 17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src); 17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src); 17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 
18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src); 18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src); 18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src); 18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src); 18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last28_rows_stg1) 18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o & stage 1 out */ 18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = o_temp_ptr; 18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = temp_ptr; 18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size << 1); 18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = trans_size; 18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) 18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = 
_mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 
18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 
19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 
19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 
20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 
20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o8[0-3] */ 20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); 20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o9[0-3] */ 20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o10[0-3] */ 21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o11[0-3] */ 21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 
21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o12[0-3] */ 21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = 
_mm_cvtsi32_si128((1 << (i4_shift - 1))); 21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o13[0-3] */ 22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 
22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o14[0-3] */ 22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, 
m_rdng_factor); 22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o15[0-3] */ 22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 
22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last24_rows_stg1) 22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o & stage 1 out */ 22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = o_temp_ptr; 23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = temp_ptr; 23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size << 1); 23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = trans_size; 23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) 23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = 
_mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); 23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_load_si128((__m128i *)pi2_src_scratch); 23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 
out_stride; 24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = 
_mm_cvtsi32_si128(i4_shift); 24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 
24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 
24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 
25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 
25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 
25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o8[0-3] */ 25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o9[0-3] */ 26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o10[0-3] */ 26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o11[0-3] */ 26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 
27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o12[0-3] */ 27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 27430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 27470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 27480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o13[0-3] */ 27500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 27520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 27530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 27550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 27560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 27570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 27580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 27600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 27610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 27630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 27640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 27650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 27660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 27680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 27690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, 
m_temp_reg_30); 27750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 27760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 27790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 27800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o14[0-3] */ 27820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 27840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 27850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 27870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 27890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 27900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 27920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 27930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 27950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 27960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 27970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 27980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 28000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 28010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 28020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 28030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 28050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 28070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 28080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 28120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 28130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o15[0-3] */ 
28150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 28170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 28180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 28200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 28220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 28230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 28250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 28260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 28280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 28290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 28300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 28310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 28330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 28340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 28350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 28360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 28380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 28400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 28410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 28470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o & stage 1 out */ 28490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 28510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = o_temp_ptr; 28520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = temp_ptr; 28530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size << 1); 28540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = trans_size; 28550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 28580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
28590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) 28600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 28620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 28630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 28640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 28650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); 28660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); 28670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); 28680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); 28690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); 28710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); 28720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); 28730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); 28740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); 28750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); 28760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); 28770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = 
_mm_srli_si128(m_temp_reg_87, 8); 28780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 28810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 28820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); 28830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); 28840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); 28850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); 28860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); 28870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); 28880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 28900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 28910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved 28920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 
//row 13 and row 15 interleaved 28930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved 28940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved 28950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved 28960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved 28970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 29000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 29020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 29030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 29040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 29050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 29070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 29080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 29100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
29110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 29120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 29130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 29140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 29150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 29170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 29180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 29200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 29220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 29240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 29250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 29270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 29280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 29300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = 
_mm_cvtsi32_si128(i4_shift); 29310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 29320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 29330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 29350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 29360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 29370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 29380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 29400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 29420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 29430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 29450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 29470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 29480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); 29490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); 29500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); 29510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); 29520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); 29530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); 29540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 29570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 29590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 29600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 29610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 29620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 29640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 29650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); 29670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
29680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 29690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 29700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 29710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 29720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 29740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 29750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 29770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 29790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 29810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 29820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 29840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 29850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 29870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = 
_mm_cvtsi32_si128(i4_shift); 29880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 29890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 29900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 29920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 29930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 29940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 29950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 29970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 29990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 30000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 30020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 30040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 30050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); 30060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); 30070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); 30080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); 30090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); 30100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); 30110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 30130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 30140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 30150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 30160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 30170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 30180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 30200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 30210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 30230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = 
_mm_madd_epi16(temp1, m_coeff5); 30250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 30260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 30270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 30280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); 30300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 30310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); 30330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 30350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 30370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 30380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 30400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 30410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 30430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 30440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 30450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 30460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 30480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 30490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 30500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 30510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 30530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 30550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 30560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 30580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 30610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 30620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]); 30630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]); 30640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]); 30650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]); 30660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]); 30670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]); 30680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 30700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 30710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 30720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 30730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 30740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 30750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 30770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 30780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 30800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = 
_mm_madd_epi16(temp1, m_coeff5); 30820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 30830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 30840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 30850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); 30870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 30880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 30900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 30920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 30940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 30950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 30970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 30980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 31000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 31010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 31020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 31030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 31050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 31060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 31070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 31080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 31100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 31120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 31130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 31150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 31170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 31180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]); 31190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]); 
31200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]); 31210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]); 31220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]); 31230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]); 31240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 31260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 31270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 31280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 31290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 31300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 31310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 31330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 31340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 31360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 31380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 31390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 31400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 31410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 31430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 31440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 31460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 31480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 31500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 31510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 31530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 31540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 31560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 31570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 
31580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 31590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 31610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 31620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 31630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 31640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 31660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 31680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 31690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 31710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 31740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 31750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]); 31760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]); 
31770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]); 31780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]); 31790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]); 31800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]); 31810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 31830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 31840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 31850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 31860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 31870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 31880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 31900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 31910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 31930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 31950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 31960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 31970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 31980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 32000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 32010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 32030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 32050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 32070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 32080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 32100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 32110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 32130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 32140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 
32150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 32160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 32180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 32190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 32200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 32210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 32230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 32250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 32260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 32300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 32310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]); 32320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]); 32330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]); 32340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]); 32350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]); 32360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]); 32370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 32400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 32410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 32420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 32430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 32440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 32450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 32470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 32480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 32500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 32520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 32530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 32540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 32550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 32570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 32580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 32600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 32620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 32640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 32650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 32670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 32680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 32700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 32710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 
32720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 32730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 32750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 32760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 32770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 32780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 32800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 32820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 32830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 32870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 32880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]); 32890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]); 32900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]); 32910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]); 32920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]); 32930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]); 32940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 32960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 32970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 32980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 32990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 33000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 33010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 33030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 33040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 33060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 33080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 
33090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 33100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 33110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 33130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 33140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 33160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 33180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 33200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 33210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 33230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 33240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 33260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 33270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 33280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = 
_mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 33290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 33310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 33320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 33330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 33340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 33360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 33380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 33390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 33410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 33430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 33440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]); 33450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]); 33460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]); 
33470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]); 33480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]); 33490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]); 33500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o8[0-3] */ 33530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 33540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 33560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 33570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 33580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 33590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 33610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 33620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 33640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 33660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 33670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 33680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 33690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 33710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 33720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 33740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 33760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 33780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 33790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 33810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 33820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 33840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 33850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 
33860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 33870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 33890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 33900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 33910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 33920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 33940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 33960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 33970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 33980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 34000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 34010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]); 34020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]); 34030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]); 34040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]); 34050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]); 34060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]); 34070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o9[0-3] */ 34100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 34110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 34120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 34130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 34140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 34150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 34170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 34180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 34200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 34220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = 
_mm_madd_epi16(temp2, m_coeff6); 34230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 34240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 34250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 34270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 34280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 34300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 34320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 34340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 34350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 34370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 34380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 34400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 34410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 34420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 34430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 34450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 34460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 34470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 34480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 34500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 34520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 34530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 34540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 34560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 34570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]); 34580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]); 34590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]); 
34600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]); 34610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]); 34620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]); 34630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o10[0-3] */ 34650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 34660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 34670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 34680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 34690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 34700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 34720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 34730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 34750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 34770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 34780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = 
_mm_madd_epi16(temp3, m_coeff7); 34790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 34800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 34820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 34830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 34850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 34870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 34890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 34900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 34920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 34930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 34950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 34960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 34970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
34980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 34990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 35000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 35010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 35020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 35030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 35050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 35070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 35080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 35090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 35110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 35120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]); 35130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]); 35140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]); 35150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]); 35160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]); 35170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]); 35180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o11[0-3] */ 35200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 35210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 35220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 35230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 35240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 35250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 35270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 35280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 35300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 35320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 35330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 35340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 35350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 35370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 35380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 35400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 35420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 35440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 35450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 35470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 35480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 35500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 35510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 35520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 35530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 35550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 35560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 35570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 35580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 35600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 35620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 35630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 35650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 35670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 35680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]); 35690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]); 35700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]); 35710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]); 
35720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]); 35730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]); 35740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o12[0-3] */ 35770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 35780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 35790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 35800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 35810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 35820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 35840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 35850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 35870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 35890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 35900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 35910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 35920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 35940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 35950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 35970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 35980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 35990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 36010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 36020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 36040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 36050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 36070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 36080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 36090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 36100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 36120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 36130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 36140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 36150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 36170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 36190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 36200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 36220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 36240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 36250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]); 36260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]); 36270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]); 36280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]); 
36290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]); 36300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]); 36310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o13[0-3] */ 36340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 36350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 36360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 36370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 36380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 36390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 36410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 36420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 36440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 36460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 36470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 36480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 36490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 36510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 36520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 36540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 36560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 36580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 36590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 36610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 36620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 36640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 36650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 36660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 36670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 36690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 36700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 36710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 36720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 36740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 36760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 36770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 36780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 36800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 36810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]); 36820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]); 36830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]); 36840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]); 36850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]); 36860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]); 36870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o14[0-3] */ 36900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 36910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 36920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 36930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 36940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 36950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 36970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 36980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 36990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 37000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 37020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 37030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 37040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 
37050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 37070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 37080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 37100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 37120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 37140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 37150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 37170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 37180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 37200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 37210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 37220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 37230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, 
m_rdng_factor); 37250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 37260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 37270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 37280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 37300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 37320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 37330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 37350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 37370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 37380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]); 37390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]); 37400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]); 37410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]); 37420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]); 37430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]); 37440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o15[0-3] */ 37460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 37470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 37480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 37490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 37500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 37510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 37530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 37540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 37560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 37580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 37590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 37600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 37610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
37620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 37630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 37640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 37660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 37680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 37700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 37710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 37730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 37740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 37760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 37770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 37780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 37790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 37810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 37820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 37830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 37840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 37860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 37880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 37890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 37900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 37910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 37920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 37930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 37940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose */ 37950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 37960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = temp_ptr; 37970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = pi2_tmp; 37980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = (trans_size << 1); 37990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 38010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 38020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 38030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 
38040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch); 38050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 38060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch); 38070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 38080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch); 38090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 38100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch); 38110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 38120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch); 38130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 38140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch); 38150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 38160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch); 38170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 38180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch); 38200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 38210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch); 38220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 
38230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch); 38240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 38250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch); 38260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 38270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch); 38280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 38290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch); 38300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 38310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch); 38320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 38330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch); 38340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 38350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); 38380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); 38390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); 38410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); 
38420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); 38440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); 38450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); 38470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); 38480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 38500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70); 38510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 38530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72); 38540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); 38560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74); 38570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 38590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76); 38600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
38610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /****************/ 38620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); 38640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); 38650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); 38670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); 38680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82); 38700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82); 38710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86); 38730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86); 38740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); 38760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); 38770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); 38790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); 
38800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81); 38820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81); 38830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85); 38850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85); 38860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /******************/ 38880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); 38900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); 38910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); 38930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); 38940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); 38960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); 38970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 38980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); 38990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); 39000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); 39020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); 39030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); 39050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); 39060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); 39080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); 39090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); 39110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); 39120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30); 39140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34); 39150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36); 39160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32); 
39170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31); 39190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35); 39200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37); 39210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33); 39220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80); 39240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84); 39250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86); 39260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82); 39270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81); 39290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85); 39300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87); 39310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83); 39320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
39330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 4 * trans_size; 39340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 39350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 39360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += 8; 39370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// pi2_dequant_coeff +=8; 39380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp += 8 * trans_size; 39390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_cols = zero_cols >> 1; 39400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 39410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(trans_size_stg1 != TRANS_SIZE_32) 39430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 39440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_setzero_si128(); 39450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = trans_size_stg1; i < 32; i += 8) 39470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 39480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = pi2_tmp; 39490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10); 39510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10); 39520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10); 39530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10); 39540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 39550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10); 39560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10); 39570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10); 39580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10); 39590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10); 39610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10); 39620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10); 39630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10); 39640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10); 39660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10); 39670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10); 39680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10); 39690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10); 39710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10); 39720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10); 39730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10); 39740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10); 39760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10); 39770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10); 39780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10); 39790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10); 39810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10); 39820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10); 39830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10); 39840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * 
trans_size), m_temp_reg_10); 39860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10); 39870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10); 39880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10); 39890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp += 8 * trans_size; 39910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 39920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 39930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp = pi2_tmp_orig; 39950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Inverse Transform 2nd stage */ 39970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 39980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < trans_size; j += 4) 39990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar i4_shift = IT_SHIFT_STAGE_2; 40010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ 40030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last28_rows_stg2) 40040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 40080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 40090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 40100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 40110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 40120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 40130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 40140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 40150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 40170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg); 40190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 40210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 40230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 
40260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 40280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 40310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 40330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 40360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 40380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[0-3] */ 40400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 40420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[0-3] */ 40450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6); 40470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[0-3] */ 40500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
40510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); 40520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[0-3] */ 40540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); 40560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 40580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 40600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 40620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 40640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 40660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 40680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[]*/ 40700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[0] */ 40720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[15] */ 
40730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[1] */ 40750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[14] */ 40760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[2] */ 40780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[13] */ 40790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[3] */ 40810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[12] */ 40820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[4] */ 40840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[11] */ 40850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[5] */ 40870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[10] */ 40880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[6] */ 40900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[9] */ 
40910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[7] */ 40930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[8] */ 40940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*o[k]*/ 40960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 40970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 40980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = temp_ptr; 40990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = 8; 41000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 41020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 41040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 41050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 41070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 41100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 41110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 41120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
41130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 41140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 41150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 41170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 41180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 41190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 41200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 41220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 41230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 41240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 41250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 41270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 41290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 41300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 41320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
41330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 41340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 41360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 41370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 41380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20); 41400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20); 41410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 41430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 41440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 41450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 41460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 41480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 41490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 41500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 41510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 41530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 41550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 41560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 41580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 41600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 41620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 41630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 41640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20); 41660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20); 41670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 41690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 41700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 41710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 41720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, 
m_rdng_factor); 41740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 41750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 41760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 41770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 41790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 41810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 41820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 41840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 41860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 41880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 41890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 41900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20); 41920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20); 41930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 
41950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 41960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 41970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 41980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 41990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 42000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 42010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 42020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 42030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 42050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 42070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 42080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 42100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 42120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 42140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 42150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); 42160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 42180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 42190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 42210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 42220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 42230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 42240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 42260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 42270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 42280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 42290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 42310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 42330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 42340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
42350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 42360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 42380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 42400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 42410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 42420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 42440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 42450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 42470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 42480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 42490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 42500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 42520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 42530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 42540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, 
m_count); 42550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 42570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 42590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 42600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 42620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 42640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 42660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 42670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 42680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 42700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 42710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 42730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 42740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 42750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
42760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 42780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 42790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 42800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 42810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 42830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 42850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 42860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 42880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 42900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 42920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 42930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 42940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 42960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 
42970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 42980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 42990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 43000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 43010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 43020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 43040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 43050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 43060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 43070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 43090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 43110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 43120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 43140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 43160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
43170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o8[0-3] */ 43180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 43190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 43200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 43220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 43230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 43250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 43260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 43270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 43280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 43300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 43310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 43320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 43330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 43350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 43370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 43380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 43390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 43410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o9[0-3] */ 43430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 43440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 43450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 43470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 43480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 43500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 43510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 43520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 43530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 43550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 43560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 43570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 43580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 43600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 43620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 43630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 43650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 43670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o10[0-3] */ 43690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 43700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 43710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 43730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 43740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 43760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 43770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, 
m_rdng_factor); 43780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 43790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 43810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 43820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 43830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 43840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 43860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 43880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 43890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 43900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 43920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o11[0-3] */ 43940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 43950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 43960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 43970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 43980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 43990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 44010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 44020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 44030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 44040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 44060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 44070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 44080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 44090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 44110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 44130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 44140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 44160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 
44180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o12[0-3] */ 44200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 44210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 44220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 44240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 44250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 44270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 44280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 44290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 44300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 44320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 44330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 44340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 44350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 44370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
44380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 44390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 44400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 44420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 44440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o13[0-3] */ 44460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 44470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 44480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 44500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 44510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 44530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 44540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 44550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 44560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 44580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_rdng_factor); 44590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 44600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 44610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 44630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 44650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 44660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 44670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 44690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o14[0-3] */ 44710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 44720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 44730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 44750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 44760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 44780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 44790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 44800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 44810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 44830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 44840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 44850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 44860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 44880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 44900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 44910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 44930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 44950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 44960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o15[0-3] */ 44970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 44980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 44990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 45010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 45020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 45040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 45050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 45060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 45070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 45090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 45100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 45110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 45120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 45140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 45160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 45170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
45210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last24_rows_stg2) 45230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 45250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 45270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 45290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]); 45300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11); 45320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 45350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 45410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 45430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
45440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 45480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 45500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 45560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 45580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 45650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[0-3] */ 45670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 45680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 45730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[0-3] */ 45750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 45800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[0-3] */ 45810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 45860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[0-3] */ 45870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 45890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
45900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 45930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 45950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 45960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 45970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 45980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 45990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 46000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 46010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]); 46030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 46050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo0[0-3] */ 46070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 46080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 46090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 46110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* eeo1[0-3] */ 46130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 46140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 46150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 46170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 46190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 46200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 46210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 46230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 46260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 46270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 46280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 46300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 46320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 46340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 46350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 46360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
46370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 46380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70); 46400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 46410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 46430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 46440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */ 46460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */ 46470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */ 46490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */ 46500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */ 46520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */ 46530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */ 46550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, 
temp4); /* ee[4] */ 46560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[]*/ 46580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */ 46600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */ 46610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */ 46630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */ 46640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */ 46660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */ 46670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */ 46690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */ 46700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */ 46720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */ 46730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */ 
46750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */ 46760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */ 46780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */ 46790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */ 46810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */ 46820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*o[k] */ 46840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 46850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = temp_ptr; 46870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = 8; 46880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 46900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 46910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 46930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 46940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]); 46950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]); 46960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 46970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 46980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 46990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 47010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 47020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 47030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 47040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 47060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 47080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 47090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 47110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 47120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 47130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
47140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 47160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 47170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 47180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 47190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 47210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 47230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 47240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 47260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 47290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 47300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 47320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 47330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 47340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, 
m_coeff2); 47350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 47370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20); 47390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20); 47400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 47420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 47430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 47440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 47450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 47470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 47480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 47490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 47500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 47520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 
47540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 47550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 47570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 47590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 47600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 47620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 47630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 47640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 47650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 47670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); 47690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20); 47700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 47720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 47730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 47740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = 
_mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 47750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 47770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 47780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 47790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 47800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 47820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 47840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 47850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 47870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 47890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 47900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 47920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 47930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 47940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 
47950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 47970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 47980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); 47990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20); 48000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 48020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 48030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 48040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 48050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 48070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 48080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 48090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 48100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 48120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 48140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar pi2_dst_scratch += out_stride; 48150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 48170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 48190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 48200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 48220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 48230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 48240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 48250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 48270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 48290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 48300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 48320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 48330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 48340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, 
m_rdng_factor); 48350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 48370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 48380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 48390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 48400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 48420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 48440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 48450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 48470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 48490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 48500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 48520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 48530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 48540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 
48550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 48570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 48590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 48600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 48620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 48630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 48640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 48650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 48670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 48680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 48690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 48700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 48720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 
48740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 48750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 48770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 48790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 48800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 48820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 48830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 48840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 48850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 48870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 48890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 48900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 48920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 48930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 48940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 48950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 48960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 48970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 48980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 48990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 49000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 49020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 49040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 49050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 49070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 49090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 49100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 49120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 49130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 49140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = 
_mm_madd_epi16(m_temp_reg_11, m_coeff2); 49150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 49170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 49190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 49200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 49220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 49230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 49240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 49250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 49270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 49280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 49290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 49300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 49320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, 
m_temp_reg_30); 49340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 49350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 49370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 49390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 49400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o8[0-3] */ 49420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 49430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 49440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 49450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 49470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 49490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 49500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 49520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 49530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 49540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 49550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 49570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 49580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 49590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 49600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 49620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 49640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 49650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 49660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 49680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 49690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o9[0-3] */ 49710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 49720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 49730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 
49740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 49760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 49780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 49790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 49810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 49820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 49830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 49840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 49860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 49870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 49880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 49890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 49910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 
49930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 49940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 49950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 49970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 49980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 49990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o10[0-3] */ 50000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 50010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 50020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 50030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 50050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 50070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 50080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 50100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 50110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 50120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, 
m_rdng_factor); 50130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 50150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 50160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 50170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 50180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 50200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 50220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 50230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 50240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 50260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 50270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o11[0-3] */ 50290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 50300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 50310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 50320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
50330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 50340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 50360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 50370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 50390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 50400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 50410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 50420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 50440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 50450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 50460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 50470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 50490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 50510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 
50520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 50540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 50560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 50570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o12[0-3] */ 50590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 50600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 50610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 50620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 50640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 50660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 50670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 50690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 50700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 50710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 
50720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 50740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 50750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 50760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 50770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 50790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 50810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 50820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 50840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 50860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 50870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o13[0-3] */ 50890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 50900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 50910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 50920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 50930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 50940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 50960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 50970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 50980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 50990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 51000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 51010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 51020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 51040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 51050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 51060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 51070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 51090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 51110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 
51120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 51130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 51150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 51160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o14[0-3] */ 51180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 51190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 51200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 51210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 51230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 51250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 51260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 51280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 51290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 51300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 51310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
51320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 51330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 51340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 51350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 51360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 51380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 51400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 51410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 51420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 51440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 51450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o15[0-3] */ 51470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 51480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 51490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 51500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, 
m_temp_reg_20); 51520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 51540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 51550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 51570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 51580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 51590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 51600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 51620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 51630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 51640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 51650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 51670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 51690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 51700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 51710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
51720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 51730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 51740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 51750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 51760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 51770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 51780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 51800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 51810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 51820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 51830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 51860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]); 51870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]); 51880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]); 51890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]); 51900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]); 
51910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]); 51920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]); 51930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11); 51950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13); 51960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19); 51970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21); 51980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 51990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 52000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 52010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 52020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 52030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 52050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 52070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 52080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 
52100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 52120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 52140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 52160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 52170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 52180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 52190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 52210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 52220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 52230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 52240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 52260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 52280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 52290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
52300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 52310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 52330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 52350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 52370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 52380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 52390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 52400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 52420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 52430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 52440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 52450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 52470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 52490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = 
_mm_madd_epi16(m_temp_reg_13, m_coeff4); 52500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 52520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 52540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 52560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 52580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 52590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 52600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 52610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 52630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 52640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 52650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 52660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 52680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = 
_mm_madd_epi16(m_temp_reg_12, m_coeff3); 52700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 52710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 52730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 52750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 52770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 52790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 52800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 52810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 52820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo4[0-3] */ 52850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 52860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 52870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 52880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 52900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 52920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 52930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 52950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 52970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 52980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 52990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 53010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25 53020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 53030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 53040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo5[0-3] */ 53060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 53070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 53080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 
53090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 53110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 53130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 53140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 53160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 53180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 53190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 53210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 53220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 53230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 53240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo6[0-3] */ 53260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 53270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 53280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 53290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 53310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 53330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 53340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 53360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 53380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 53400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 53420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 53430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 53440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 53450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo7[0-3] */ 53470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 53480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 53490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 53500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 53520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 53540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 53550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 53570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 53590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 53620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 53640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 53660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 53670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 53680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 53690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]); 53710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]); 53720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]); 53730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]); 53740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo0[0-3] */ 53760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 53770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 53790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 53800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 53820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 53830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 53850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 53870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 53890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 
53900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo1[0-3] */ 53920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 53930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 53940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 53950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 53970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 53980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 53990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 54010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 54020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 54040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 54050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 54060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 54070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 54090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 54110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 54130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 54140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 54160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 54170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 54180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 54190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 54210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 54230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 54260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 54280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 54290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 54310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 54320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
54330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]); 54340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]); 54350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 54370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 54390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]); 54400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 54420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 54440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 54450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 54470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 54480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[0]= m_temp_reg_20 */ 54500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[1]= m_temp_reg_21 */ 54510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[0]= m_temp_reg_22 */ 
54520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[1]= m_temp_reg_23 */ 54530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] = eeee[0] + eeeo[0]; */ 54550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 54560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[3] = eeee[0] - eeeo[0]; */ 54580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 54590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[2] = eeee[1] - eeeo[1]; */ 54610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 54620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] = eeee[1] + eeeo[1];*/ 54640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 54650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */ 54670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */ 54680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */ 54700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */ 54710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
54720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */ 54730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */ 54740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */ 54760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */ 54770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* e[]*/ 54790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */ 54810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */ 54820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */ 54840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */ 54850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */ 54870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */ 54880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */ 54900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */ 
54910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */ 54930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */ 54940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */ 54960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */ 54970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 54980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */ 54990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */ 55000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */ 55020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */ 55030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*o[k] */ 55050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 55060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = temp_ptr; 55080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = 8; 55090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 
55110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 55120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); 55130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); 55140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); 55150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); 55160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); 55170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); 55180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 55210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 55220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]); 55230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]); 55240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]); 55250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]); 55260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]); 55270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]); 55280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]); 55300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]); 55310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]); 55320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]); 55330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]); 55340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]); 55350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]); 55360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]); 55370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 55390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 55400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved 55410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = 
_mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved 55420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved 55430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved 55440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved 55450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved 55460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 55480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 55490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 55500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 55510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 55520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 55530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 55550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 55560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 55580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
55590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 55600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 55610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 55620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 55630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 55650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 55660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 55680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 55700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 55720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 55730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 55750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 55760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 55770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, 
m_rdng_factor); 55780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 55800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 55810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 55820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 55830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 55850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 55870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 55880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 55900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 55910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 55920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 55930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); 55940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); 55950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); 
55960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); 55970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); 55980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); 55990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 56010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 56020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 56030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 56040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 56050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 56060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 56080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 56090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); 56110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 56130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 56140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 56150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 56160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 56180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 56190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 56210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 56230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20); 56250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20); 56260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 56280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 56290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 56300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 56310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 56330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_rdng_factor); 56340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 56350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 56360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 56380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 56400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 56410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 56430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 56450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 56460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); 56470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); 56480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); 56490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); 56500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); 56510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); 56520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 56540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 56550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 56560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 56570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 56580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 56590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 56610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 56620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 56640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 56660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 56670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 56680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 56690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, 
m_temp_reg_41); 56710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 56720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); 56740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 56760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); 56780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20); 56790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 56810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 56820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 56830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 56840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 56860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 56870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 56880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 56890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
56900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 56910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 56930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 56940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 56960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 56970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 56980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 56990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]); 57000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]); 57010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]); 57020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]); 57030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]); 57040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]); 57050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 57070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
57080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 57090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 57100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 57110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 57120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 57140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 57150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 57170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 57190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 57200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 57210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 57220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); 57240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 57250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = 
_mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 57270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 57290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); 57310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20); 57320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 57340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 57350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 57360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 57370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 57390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 57400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 57410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 57420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 57440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 
57460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 57470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 57490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 57510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 57520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]); 57530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]); 57540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]); 57550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]); 57560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]); 57570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]); 57580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 57600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 57610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 57620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 57630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = 
_mm_madd_epi16(m_temp_reg_12, m_coeff3); 57640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 57650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 57670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 57680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 57700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 57720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 57730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 57740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 57750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 57770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 57780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 57800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 57820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
57830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 57840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 57850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 57860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 57870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 57880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 57890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 57910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 57920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 57930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 57940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 57960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 57970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 57980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 57990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 58010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 58030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 58040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]); 58050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]); 58060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]); 58070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]); 58080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]); 58090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]); 58100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 58120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 58130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 58140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 58150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 58160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 58170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 
58190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 58200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 58220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 58240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 58250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 58260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 58270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 58290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 58300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 58320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 58340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 58360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 58370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = 
_mm_cvtsi32_si128((1 << (i4_shift - 1))); 58390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 58400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 58410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 58420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 58440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 58450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 58460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 58470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 58490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 58510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 58520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 58540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 58560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 58570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]); 58580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]); 58590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]); 58600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]); 58610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]); 58620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]); 58630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 58650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 58660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 58670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 58680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 58690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 58700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 58720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 58730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 
58750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 58770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 58780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 58790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 58800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 58820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 58830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 58850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 58870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 58890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 58900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 58920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 58930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 
58940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 58950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 58960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 58970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 58980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 58990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 59000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 59020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 59040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 59050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 59070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 59090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 59100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]); 59110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]); 59120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]); 59130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]); 59140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]); 59150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]); 59160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 59180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 59190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 59200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 59210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 59220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 59230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 59250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 59260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 59280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 59300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 
59310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 59320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 59330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 59350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 59360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 59380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 59400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 59420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 59430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 59450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 59460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 59470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 59480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
59500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 59510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 59520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 59530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 59550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 59570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 59580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 59600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 59620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 59630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]); 59640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]); 59650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]); 59660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]); 59670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]); 59680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]); 59690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o8[0-3] */ 59710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 59720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 59730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 59740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 59750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 59760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 59780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 59790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 59810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 59830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 59840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 59850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 59860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
59870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 59880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 59890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 59910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 59930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 59950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 59960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 59970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 59980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 59990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 60000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 60010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 60030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 60040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 60050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_sra_epi32(m_temp_reg_30, m_count); 60060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 60080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 60100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 60110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 60120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 60140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 60150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]); 60160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]); 60170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]); 60180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]); 60190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]); 60200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]); 60210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o9[0-3] */ 
60230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 60240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 60250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 60260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 60270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 60280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 60300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 60310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 60330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 60350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 60360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 60370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 60380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 60400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 60410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
60420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 60430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 60450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 60470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 60480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 60500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 60510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 60520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 60530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 60550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 60560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 60570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 60580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 60600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
60610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 60620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 60630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 60640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 60660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 60670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]); 60680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]); 60690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]); 60700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]); 60710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]); 60720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]); 60730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o10[0-3] */ 60750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 60760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 60770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 
60780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 60790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 60800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 60820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 60830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 60850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 60870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 60880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 60890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 60900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 60920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 60930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 60950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 
60970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 60980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 60990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 61000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 61020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 61030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 61040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 61050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 61070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 61080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 61090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 61100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 61120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 61140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 61150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 61160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
61170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 61190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 61200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]); 61210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]); 61220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]); 61230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]); 61240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]); 61250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]); 61260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o11[0-3] */ 61280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 61290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 61300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 61310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 61320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 61330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
61340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 61350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 61360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 61380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 61400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 61410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 61420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 61430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 61450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 61460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 61480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 61500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 61520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 
61530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 61550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 61560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 61570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 61580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 61600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 61610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 61620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 61630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 61650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 61670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 61680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 61700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 61720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 61730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]); 61740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]); 61750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]); 61760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]); 61770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]); 61780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]); 61790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o12[0-3] */ 61810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 61820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 61830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 61840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 61850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 61860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 61880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 61890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
61900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 61910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 61930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 61940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 61950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 61960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 61970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 61980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 61990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 62010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 62030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 62050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 62060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 62080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 62090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 62100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 62110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 62130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 62140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 62150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 62160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 62180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 62200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 62210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 62230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 62250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 62260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]); 62270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]); 
62280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]); 62290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]); 62300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]); 62310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]); 62320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o13[0-3] */ 62340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 62350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 62360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 62370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 62380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 62390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 62410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 62420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 62440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 
62460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 62470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 62480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 62490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 62510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 62520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 62540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 62560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 62580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 62590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 62610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 62620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 62630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 62640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 62660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 62670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 62680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 62690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 62710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 62730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 62740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 62750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 62770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 62780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]); 62790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]); 62800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]); 62810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]); 62820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]); 62830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]); 62840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o14[0-3] */ 62860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 62870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 62880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 62890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 62900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 62910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 62930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 62940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 62960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 62970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 62980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 62990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 63000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 63010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 63020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 63030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 63040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 63060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 63080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 63100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 63110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 63130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 63140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 63150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 63160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 63180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 63190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 63200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_sra_epi32(m_temp_reg_30, m_count); 63210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 63230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 63250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 63260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 63280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 63300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 63310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]); 63320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]); 63330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]); 63340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]); 63350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]); 63360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]); 63370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* o15[0-3] */ 63390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 63400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 63410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 63420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 63430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 63440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 63460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 63470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 63490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 63510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 63520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 63530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 63540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 63560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 63570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
63580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 63590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 63610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 63630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 63640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 63660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 63670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 63680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 63690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 63710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 63720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 63730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 63740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 63760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 63780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 63790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 63800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 63820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 63840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose */ 63860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 63870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = temp_ptr; 63890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = dst_strd; 63900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = 8; 63910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 63920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 63930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 63940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch); 63950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 63960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch); 63970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 63980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch); 63990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 
64000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch); 64010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch); 64030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch); 64050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch); 64070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 64080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch); 64100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch); 64120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch); 64140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch); 64160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch); 64180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 
64190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch); 64200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch); 64220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 64230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch); 64240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 64250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); 64280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); 64290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); 64310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); 64320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); 64340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); 64350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); 64370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); 
64380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 64400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70); 64410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 64430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72); 64440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); 64460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74); 64470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 64490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76); 64500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); 64530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); 64540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); 64560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); 
64570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82); 64590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82); 64600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86); 64620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86); 64630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); 64650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); 64660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); 64680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); 64690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81); 64710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81); 64720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85); 64740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85); 64750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
64760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7 64780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7 64790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31 64810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31 64820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15 64840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15 64850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23 64870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23 64880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7 64900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7 64910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31 64930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31 64940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15 64960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15 64970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 64980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23 64990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23 65000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 65020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 65040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0); 65060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 65090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0); 65110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 65120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 65130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 65140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 65160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 65180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0); 65200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 65230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0); 65250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 65260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 65280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 65290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 65300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 65330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
65340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 65350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0); 65370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 65400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0); 65420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 65430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 65450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 65470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 65490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0); 65510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 
65540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0); 65560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 65570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 65590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 65600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 65610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 65630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 65650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0); 65670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 65700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0); 65720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 65730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i 
*)pu1_dst, m_temp_reg_20); 65750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 65770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 65790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0); 65810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 65840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0); 65860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 65870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 65890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 65900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 65910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 65940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 
65960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 65970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0); 65980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 65990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 66010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0); 66030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 66040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 66060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 66080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 66100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0); 66120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 66130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 66150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = 
_mm_add_epi16(m_temp_reg_83, m_temp_reg_0); 66170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 66180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 66200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 66210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += pred_strd; 66220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 66240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp += 4; 66250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 66260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 66270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 66280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6629