10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * ihevc_iquant_itrans_recon_atom_intr.c 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Contains function definitions for inverse quantization, inverse 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform and reconstruction 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @author 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 100470 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 100592 (edited by) 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par List of Functions: 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * - ihevc_iquant_itrans_recon_16x16_ssse3() 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar#include <stdio.h> 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <string.h> 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h" 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h" 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h" 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h" 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_tables.h" 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_itrans_recon.h" 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h" 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_macros.h" 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h> 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h> 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h> 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * This function performs inverse quantization, inverse transform and 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * reconstruction for 
16x16 input block 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description: 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Performs inverse quantization, inverse transform and adds the 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * prediction data and clips output to 8 bit 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input 16x16 coefficients 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_tmp 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Temporary 16x16 buffer for storing inverse 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform 1st stage output 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction 16x16 block 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_dequant_coeff 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Dequant Coeffs (NOTE: not in this function's parameter list) 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[out] pu1_dst 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output 16x16 block 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_div 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter / 6 (NOTE: not in this function's parameter list) 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_rem 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Quantization parameter % 6 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Input stride 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Prediction stride 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Output Stride 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * Zero columns in pi2_src 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns Void 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * None 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ******************************************************************************* 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */ 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src, 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp, 
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_pred, 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 pred_strd, 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_cols, 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_rows) 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_10; 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20; 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_21; 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_22; 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_23; 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_24; 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_25; 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_26; 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_27; 
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_30; 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_31; 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_32; 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_33; 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_34; 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_35; 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_36; 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_37; 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_40; 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_41; 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_42; 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_43; 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_44; 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_45; 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_46; 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_47; 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_70; 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_71; 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_72; 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_73; 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_74; 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_75; 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
__m128i m_temp_reg_76; 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_77; 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_rdng_factor; 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_count; 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i; 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*Lokesh*/ 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last8_cols_stg1; 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last8_rows_stg1; 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last12_rows_stg1; 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last12_rows_stg2; 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 zero_last8_rows_stg2; 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 loop = 0; 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i4_shift = IT_SHIFT_STAGE_1; 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 trans_size = TRANS_SIZE_16; 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Following 3 instructions replicates the value in the */ 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lower 16 
bits of m_add_iq in the entire register */ 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0; 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0; 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0; 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0; 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_last8_rows_stg2 = zero_last8_cols_stg1; 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last8_cols_stg1) 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar loop = 1; 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar loop = 2; 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i = 0 => lower 8 samples */ 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i = 1 => higher 8 samples */ 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = 0; i < loop; i++) 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 sample_half_index = i << 3; 
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp_src = pi2_src + sample_half_index; 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* If last 12 rows are zero : Rishab */ 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last12_rows_stg1) 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee */ 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff and src for use in next block */ 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = m_temp_reg_24; 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = m_temp_reg_25; 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* If last 8 rows are zero : Rishab */ 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last8_rows_stg1) 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee */ 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff and src for use in next block */ 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = m_temp_reg_24; 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = m_temp_reg_25; 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = 
_mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 
-89 5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = 
_mm_madd_epi16(m_temp_reg_11, m_coeff1); 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* If all the rows are non-zero : Rishab */ 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee */ 6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff and src for use in next block */ 6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's 6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's 6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); 6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); 6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = 
_mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); 7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); 7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); 7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); 
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, 
m_temp_reg_32); 7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 
8; 7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 8; 8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 sample_half_index = i << 3; 8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd; 8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_tmp_src += (src_strd << 1); 8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o & stage 1 out */ 8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = (i) ? 
(pi2_tmp + 8 * trans_size) : pi2_tmp; 8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size << 1); 8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = trans_size << 1; 8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last12_rows_stg1) 8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) //H8B= higher 8 bytes L8B lower 8 bytes 8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 
= _mm_load_si128((__m128i *)pi2_src_scratch); 10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last8_rows_stg1) 10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) 10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 
in_stride; 10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = 
_mm_madd_epi16(m_temp_reg_11, m_coeff2); 12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i 
*)pi2_dst_scratch, m_temp_reg_30); 12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) //H8B= higher 8 bytes L8B lower 8 bytes 12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B 12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B 13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B 13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B 13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 
13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, 
m_coeff7); 13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); 14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_temp_reg_20); 14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = 
_mm_loadu_si128((__m128i *)pi2_src_scratch); 14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, 
m_count); 15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 
15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 
15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose */ 15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = ((i) ? 
(pi2_tmp + 8 * trans_size) : pi2_tmp); 15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size << 1); 15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = (trans_size << 1); 15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a 15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c 15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e 15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g 15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i 15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k 16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m 
16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o 16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 
16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 
16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1 16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2 16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3 16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44); 16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41); 16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45); 16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42); 16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46); 
16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43); 16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47); 16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last8_cols_stg1) 16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size); 16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size << 1); 16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_setzero_si128(); 16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 
16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch -= out_stride; 16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += 8; 16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Stage 2 */ 16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = 0; i < 2; i++) 16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_temp = (i) ? 
(pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp); 17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 stride = (trans_size); 17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MEM_ALIGN16 WORD16 temp_array[256]; 17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar i4_shift = IT_SHIFT_STAGE_2; 17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last12_rows_stg2) 17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 9); 17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(!i) 17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 6 + 8); 17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 2 + 8); 17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 9); 17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_setzero_si128(); 17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_setzero_si128(); 17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_setzero_si128(); 17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_setzero_si128(); 17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee */ 17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff and src for use in next block */ 17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff and src for use in next block */ 17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70); 17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = m_temp_reg_24; 17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = m_temp_reg_25; 17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* */ 17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20); 17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20); 17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = temp_array; 17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = 8; 17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][0-3] stored in pu1_dst[0] */ 17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][0-3] stored in pu1_dst[1] */ 17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35); 17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in 
m_temp_reg_40-41 & m_temp_reg_46-47 */ 17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][4-7] stored in pu1_dst[2] */ 17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][4-7] stored in pu1_dst[3] */ 17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pu1_dst[4] */ 18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pu1_dst[5] */ 18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pu1_dst[6]*/ 18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pu1_dst[7] */ 
18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pu1_dst[8]*/ 18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pu1_dst[9] */ 18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pu1_dst[10]*/ 18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pu1_dst[11] */ 18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); 
//18 -50 18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pu1_dst[12]*/ 18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pu1_dst[13] */ 18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 
18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pu1_dst[14]*/ 18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pu1_dst[15] */ 18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last8_rows_stg2) 19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83 
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36 19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride); 19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 8); 19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(!i) 19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 6 + 8); 19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 2 + 8); 19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 8); 19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride); 19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = 
_mm_setzero_si128(); 19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee */ 19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 
Loading coeff and src for use in next block */ 19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70); 19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = m_temp_reg_24; 19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = m_temp_reg_25; 19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = temp_array; 19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = 8; 19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][0-3] stored in pu1_dst[0] */ 19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][0-3] stored in pu1_dst[1] */ 19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35); 20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += 
out_stride; 20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][4-7] stored in pu1_dst[2] */ 20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][4-7] stored in pu1_dst[3] */ 20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pu1_dst[4] */ 20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pu1_dst[5] */ 20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 
20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[4-7] */ 20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pu1_dst[6]*/ 20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pu1_dst[7] */ 20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar pi2_scratch += out_stride; 20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pu1_dst[8]*/ 20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pu1_dst[9] */ 20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pu1_dst[10]*/ 20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pu1_dst[11] */ 20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pu1_dst[12]*/ 21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pu1_dst[13] */ 21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pu1_dst[14]*/ 21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pu1_dst[15] */ 21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, 
m_temp_reg_34); 21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo */ 21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride); 21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 7); 21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8 21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride); 21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12 
21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(!i) 21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 6 + 8); 21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 2 + 8); 21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14 21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride); 21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10 21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 7); 21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride); 21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee */ 21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Loading coeff and src for use in next block */ 21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's 
21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's 21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); 21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); 21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo */ 22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_scratch = temp_array; 22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = 8; 22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[0-3] */ 22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 
22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo0[4-7] */ 22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 
e[0][4-7] stored in pi2_tmp[1][0-7] */ 22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo1[0-3] */ 22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); 22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); 22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* eo1[4-7] */ 22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); 23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); 23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
pi2_scratch += out_stride; 23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[0-3] */ 23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 
23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo2[4-7] */ 23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[0-3] */ 23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, 
m_temp_reg_35); 23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* eo3[4-7] */ 23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_scratch += out_stride; 23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(zero_last12_rows_stg2) 23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o & stage 2 pre-transposed out */ 23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = temp_array; 24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size); 24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = (8) * 4; 24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 9); 24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == i) 24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 2 - 8); 24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 6 - 8); 24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 9); 24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) 24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 
24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += ((!i) * out_stride + 8); 24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 
25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += (i * out_stride + 8); 25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 
25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += ((!i) * out_stride + 8); 25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += (i * out_stride + 8); 26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(zero_last8_rows_stg2) 26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o & stage 2 pre-transposed out */ 26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = temp_array; 26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size); 26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = (8) * 4; 26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride); 26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 8); 26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == i) 26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 2 - 8); 
26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 6 - 8); 26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 8); 26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride); 26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) 26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = 
_mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 
27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 
27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += ((!i) * out_stride + 8); 27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 27430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 27440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 27460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 27470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 27490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 27500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 27510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 27530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 
27540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 27600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 27610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 27640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 27660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 27670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 27690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 27700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 27720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 27730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = 
_mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 27750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 27760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 27770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 27790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 27800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 27810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 27820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 27840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 27860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += (i * out_stride + 8); 27870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 27880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 27900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 27910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 27920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 27930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i 
*)pi2_src_scratch); 27950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 27960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 27970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 27980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 27990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 28010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 28020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 28030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 28060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 28070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 28080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 28090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 28110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 28130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 
28140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 28170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 28190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 28200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 28220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 28230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 28250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 28260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 28280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 28290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 28300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 28330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 28340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 28350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 28360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 28380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 28400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += ((!i) * out_stride + 8); 28410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 28440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 28460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 28470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 28490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 28500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 28520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 28530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 
28550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 28560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 28570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 28590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 28600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 28610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 28620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 28640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 28660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 28670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 28700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 28720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 28730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 
28750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 28760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 28780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 28790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 28800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 28820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 28830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 28840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 28850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 28870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 28880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 28890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += (i * out_stride + 8); 28900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 28940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 28950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* o & stage 2 pre-transposed out */ 28970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 28980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 j; 28990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch = temp_array; 29000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 29010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = (trans_size); 29020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = (8) * 4; 29030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 29050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 29080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride); 29090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 29100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride * 7); 29110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9 29120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp += (stride); 29130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13 29140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == i) 29150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 2 - 8); 29170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
29180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 29190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 6 - 8); 29210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 29220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15 29230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride); 29240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11 29250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride * 7); 29260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 29270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_temp -= (stride); 29280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 29290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 29320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(j) //H8B= higher 8 bytes L8B lower 8 bytes 29350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 29370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 29380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, 
m_temp_reg_75); //row 9 and row 11 H8B 29390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B 29400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 29410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 29420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 29440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 29450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B 29460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B 29470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 29480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 29490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 29500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 29510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 29520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o0[0-3] */ 29550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); 29570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 29580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 29590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 29600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 29630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 29640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 29660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 29670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 29680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 29690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 29710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 29720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 29730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 29740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 29750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 29770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_count = _mm_cvtsi32_si128(i4_shift); 29780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 29790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 29810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 29820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 29830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 29840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 29860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 29880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 29890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 29900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o1[0-3] */ 29920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 29930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 29940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 
29950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 29960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 29970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 29990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 30000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 30010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 30030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 30040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 30050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 30060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 30080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 30090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 30100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 30110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 30120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
30130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 30140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 30150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 30160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 30170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 30190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 30210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += ((!i) * out_stride + 8); 30220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 30230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o2[0-3] */ 30250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 30260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 30270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 30280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 30290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 30300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 30320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 
30330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 30350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 30360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 30370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 30380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 30400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 30410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 30420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 30430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 30440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 30460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 30470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 30480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 30490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 30510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 30530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 30540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 30550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o3[0-3] */ 30570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 30580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 30590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 30600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 30610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 30620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 30650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 30660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 30680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 30690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 30700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 30710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 30730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 30740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 30750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 30760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 30770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 30790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 30800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 30810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 30820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 30840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 30860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += (i * out_stride + 8); 30870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 30880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o4[0-3] */ 
30900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 30910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 30920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 30930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 30940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 30950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 30970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 30980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 30990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 31000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 31010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 31020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 31030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 31050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 31060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 31070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = 
_mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 31080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 31090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 31110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 31120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 31130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 31140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 31160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 31180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 31190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 31200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o5[0-3] */ 31220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 31230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 31240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 31250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 31260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 31270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
31280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 31300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 31310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 31330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 31340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 31350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 31360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 31380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 31390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 31400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 31410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 31420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 31440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 31450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 31460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 31470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 31490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 31510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += ((!i) * out_stride + 8); 31520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 31530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o6[0-3] */ 31550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 31560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 31570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 31580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 31590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 31600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 31630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch -= in_stride; 31640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 31660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 31670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 31680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 31690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 31720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 31730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 31740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 31750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 31760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 31780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 31790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 31800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 31810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 31830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 31850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += out_stride; 31860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 31870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* o7[0-3] */ 31890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 31900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 31910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 31920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 31930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 31940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 31960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += 8; 31970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 31980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 31990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 32000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 32010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 32020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 32030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
32050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 32060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 32070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 32080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 32090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 32110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 32130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst_scratch += (i * out_stride + 8); 32140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 32200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Transpose */ 32220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 32230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src_scratch; 32240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_pred_temp = pu1_pred; 32250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 out_stride = dst_strd; 32260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 in_stride = trans_size; 32270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
WORD32 j; 32280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_setzero_si128(); 32290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = 0; i < 2; i++) 32300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 32310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp; 32320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < 2; j++) 32340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 32350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a 32360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 32370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c 32380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += ((!i) * in_stride + 8); 32390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e 32400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += (in_stride); 32410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g 32420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += (i * in_stride + 8); 32430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i 32440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 32450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k 32460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += ((!i) * in_stride + 8); 
32470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m 32480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += in_stride; 32490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o 32500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src_scratch += (i * in_stride + 8); 32510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 32530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 32540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 32560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 32570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 32590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 32600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 32620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 32630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
32650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 32660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 32670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 32690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 32700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 32720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 32730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 32750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 32760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 32790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 32800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 32820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = 
_mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 32830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 32850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0); 32860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12); 32870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 32890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 32900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 32910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred_temp += pred_strd; 32920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 32940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 32950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 32970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 32980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 32990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 33000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0); 
33010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12); 33020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45); 33040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 33050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 33060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred_temp += pred_strd; 33070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 33090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 33100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 33120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 33130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 33150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0); 33160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12); 33170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46); 33190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, 
m_temp_reg_20); 33200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 33210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred_temp += pred_strd; 33220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 33240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 33250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 33270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 33280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 33300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0); 33310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12); 33320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 33330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47); 33340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 33350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += out_stride; 33360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred_temp += pred_strd; 33370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 33380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 33390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 
33400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 3341