10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  ihevc_itrans_recon_32x32_atom_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Contains function definitions for inverse  quantization, inverse
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform and reconstruction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  100470
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par List of Functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  - ihevc_itrans_recon_32x32_ssse3()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  None
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h>
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <string.h>
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_tables.h"
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_iquant_itrans_recon.h"
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_macros.h"
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h>
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h>
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  This function performs inverse transform and reconstruction for a
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * 32x32 input block
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description:
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Performs inverse transform, adds the prediction data and clips the
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * output to 8 bit
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Input 32x32 coefficients
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_tmp
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Temporary buffer for storing inverse transform 1st stage output;
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  must hold more than 2048 WORD16 entries (offsets 1024 and 2048 are used)
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Prediction 32x32 block
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_dequant_coeff
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Dequant Coeffs
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[out] pu1_dst
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Output 32x32 block
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_div
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Quantization parameter / 6
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_rem
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Quantization parameter % 6
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Input stride
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Prediction stride
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Output Stride
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Zero columns in pi2_src
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns  Void
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  None
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**/
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src,
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD16 *pi2_tmp,
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    UWORD8 *pu1_pred,
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    UWORD8 *pu1_dst,
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 src_strd,
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 pred_strd,
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 dst_strd,
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 zero_cols,
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 zero_rows)
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Inverse Transform */
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 j;
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *pi2_tmp_orig;
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /*MEM_ALIGN16  WORD32 temp_array[1024];
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MEM_ALIGN16  WORD16 temp1_array[1024];*/
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *o_temp_ptr;
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 *temp_ptr;
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_2;
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_3;
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_4;
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_5;
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_6;
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_7;
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_15;
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_16;
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_17;
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_18;
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_19;
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20;
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_21;
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_22;
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_23;
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_30;
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_31;
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_32;
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_33;
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_34;
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_35;
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_36;
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_37;
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_40;
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_41;
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_42;
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_43;
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_44;
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_45;
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_46;
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_47;
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_70;
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_71;
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_72;
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_73;
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_74;
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_75;
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_76;
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_77;
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_80;
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_81;
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_82;
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_83;
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_84;
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_85;
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_86;
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_87;
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_90;
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_91;
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_92;
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_93;
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_94;
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_95;
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_96;
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_97;
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_rdng_factor;
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_count;
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i temp1, temp2, temp3, temp4;
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i temp5, temp6, temp7, temp8;
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i all_zero_reg;
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i;
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /*Lokesh*/
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last24_cols_stg1;
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last24_rows_stg1;
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last28_rows_stg1;
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last28_rows_stg2;
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last24_rows_stg2;
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  trans_size_stg1;
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i4_shift = IT_SHIFT_STAGE_1;
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 trans_size = TRANS_SIZE_32;
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Flags set when the last 24 (or 28) rows/cols of the 32x32 block are all zero, so they can be skipped : Lokesh */
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last24_rows_stg2 = zero_last24_cols_stg1;
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        trans_size_stg1 = 8;
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        trans_size_stg1 = 32;
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    all_zero_reg = _mm_setzero_si128();
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    o_temp_ptr  = pi2_tmp;
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp_ptr = (pi2_tmp + 1024);
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_tmp += 2048;
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_tmp_orig = pi2_tmp;
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(i = 0; i < trans_size_stg1; i += 8)
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_tmp_src = pi2_src;
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(zero_last28_rows_stg1)
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo */
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[0]= m_temp_reg_20  */
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[1]= m_temp_reg_21  */
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[0]= m_temp_reg_22  */
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[1]= m_temp_reg_23  */
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] = eeee[0] + eeeo[0]; */
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = m_temp_reg_14;
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[3] = eeee[0] - eeeo[0]; */
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = m_temp_reg_14;
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[2] = eeee[1] - eeeo[1]; */
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] = eeee[1] + eeeo[1];*/
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[0]= m_temp_reg_20  */
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[1]= m_temp_reg_21  */
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[0]= m_temp_reg_22  */
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[1]= m_temp_reg_23  */
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] = eeee[0] + eeeo[0]; */
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = m_temp_reg_14;
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[3] = eeee[0] - eeeo[0]; */
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = m_temp_reg_14;
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[2] = eeee[1] - eeeo[1]; */
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] = eeee[1] + eeeo[1];*/
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_scratch = o_temp_ptr;
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71);
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71);
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /**************************************************************************/
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[0-3] */
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[4-7] */
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /***********************************************************************/
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[0-3] */
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[4-7] */
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[0-3] */
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[4-7] */
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[0-3] */
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[4-7] */
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(zero_last24_rows_stg1)
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo */
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[0]= m_temp_reg_20  */
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[1]= m_temp_reg_21  */
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[0]= m_temp_reg_22  */
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[1]= m_temp_reg_23  */
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] = eeee[0] + eeeo[0]; */
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = m_temp_reg_14;
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[3] = eeee[0] - eeeo[0]; */
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = m_temp_reg_14;
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[2] = eeee[1] - eeeo[1]; */
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] = eeee[1] + eeeo[1];*/
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* for row 4 to 7 */
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[0]= m_temp_reg_20  */
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[1]= m_temp_reg_21  */
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[0]= m_temp_reg_22  */
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[1]= m_temp_reg_23  */
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] = eeee[0] + eeeo[0]; */
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = m_temp_reg_14;
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[3] = eeee[0] - eeeo[0]; */
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = m_temp_reg_14;
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[2] = eeee[1] - eeeo[1]; */
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] = eeee[1] + eeeo[1];*/
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo[] */
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* for(k = 0; k < 4; k++) */
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_setzero_si128();
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo */
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo0[0-3] */
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_90 = m_temp_reg_34;
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_97 = m_temp_reg_35;
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo0[4-7] */
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_91 = m_temp_reg_34;
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_96 = m_temp_reg_35;
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo1[0-3] */
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[1][0-3] stored in pi2_tmp[2][0-7] */
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[6][0-3] stored in pi2_tmp[2][8-15] */
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_92 = m_temp_reg_34;
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_95 = m_temp_reg_35;
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo1[4-7] */
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[1][4-7] stored in pi2_tmp[3][0-7] */
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[6][4-7] stored in pi2_tmp[3][8-15] */
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_93 = m_temp_reg_34;
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_94 = m_temp_reg_35;
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo2[0-3] */
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[2][0-3] stored in pi2_tmp[4][0-7] */
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[5][0-3] stored in pi2_tmp[4][8-15] */
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp1 = m_temp_reg_34;
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp7 = m_temp_reg_35;
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo2[4-7] */
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[2][4-7] stored in pi2_tmp[5][0-7] */
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[5][4-7] stored in pi2_tmp[5][8-15] */
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp2 = m_temp_reg_34;
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp6 = m_temp_reg_35;
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo3[0-3] */
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[3][0-3] stored in pi2_tmp[6][0-7] */
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[4][0-3] stored in pi2_tmp[6][8-15] */
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp3 = m_temp_reg_34;
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp5 = m_temp_reg_35;
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo3[4-7] */
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[3][4-7] stored in pi2_tmp[7][0-7] */
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[4][4-7] stored in pi2_tmp[7][8-15] */
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp4 = m_temp_reg_34;
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp8 = m_temp_reg_35;
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* All values of ee[] array in pi2_temp */
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_scratch = o_temp_ptr;
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /**************************************************************************/
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[0-3] */
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[4-7] */
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /***********************************************************************/
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[0-3] */
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[4-7] */
9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[0-3] */
10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[4-7] */
10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[0-3] */
10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[4-7] */
10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo */
10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[0] stored in m_temp_reg_20, eeee[0] stored in m_temp_reg_21 */
10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[1] stored in m_temp_reg_22, eeee[1] stored in m_temp_reg_23 */
10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[0]= m_temp_reg_20  */
10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[0]= m_temp_reg_21  */
10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[1]= m_temp_reg_22  */
10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[1]= m_temp_reg_23  */
10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] = eeee[0] + eeeo[0]; */
10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[3] = eeee[0] - eeeo[0]; */
11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[2] = eeee[1] - eeeo[1]; */
11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] = eeee[1] + eeeo[1];*/
11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* for row 4 to 7 */
11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Interleaving row 8 and row 24*/
11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[0]= m_temp_reg_20  */
11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[0]= m_temp_reg_21  */
11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeeo[1]= m_temp_reg_22  */
11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeee[1]= m_temp_reg_23  */
11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] = eeee[0] + eeeo[0]; */
11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[3] = eeee[0] - eeeo[0]; */
11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[2] = eeee[1] - eeeo[1]; */
11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] = eeee[1] + eeeo[1];*/
11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                // eeo[]
11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* for(k = 0; k < 4; k++) */
11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo */
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo0[0-3] */
11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo0[4-7] */
11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo1[0-3] */
11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo1[4-7] */
12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo2[0-3] */
12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[2][0-3] stored in pi2_tmp[4][0-7] */
12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[5][0-3] stored in pi2_tmp[4][8-15] */
12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo2[4-7] */
12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[2][4-7] stored in pi2_tmp[5][0-7] */
12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[5][4-7] stored in pi2_tmp[5][8-15] */
12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo3[0-3] */
12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[3][0-3] stored in pi2_tmp[6][0-7] */
12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[4][0-3] stored in pi2_tmp[6][8-15] */
12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* eeo3[4-7] */
12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[3][4-7] stored in pi2_tmp[7][0-7] */
12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* e[4][4-7] stored in pi2_tmp[7][8-15] */
12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* All values of ee[] array in pi2_temp */
12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* for(k = 0; k < 8; k++) */
13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_scratch = o_temp_ptr;
13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /**************************************************************************/
14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[0-3] */
15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[4-7] */
15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /***********************************************************************/
15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[0-3] */
16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[4-7] */
16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[0-3] */
16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[4-7] */
16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[0-3] */
17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[4-7] */
17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  All e[] are done */
17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /****************************/
17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_tmp_src = pi2_src + src_strd;
17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(zero_last28_rows_stg1)
18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o & stage 1 out */
18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 j;
18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_src_scratch = o_temp_ptr;
18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = temp_ptr;
18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = (trans_size << 1);
18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 in_stride = trans_size;
18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j)
18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o8[0-3] */
20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o9[0-3] */
20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o10[0-3] */
21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o11[0-3] */
21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o12[0-3] */
21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o13[0-3] */
22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o14[0-3] */
22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o15[0-3] */
22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(zero_last24_rows_stg1)
22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o & stage 1 out */
22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 j;
23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_src_scratch = o_temp_ptr;
23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = temp_ptr;
23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = (trans_size << 1);
23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 in_stride = trans_size;
23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j)
23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o8[0-3] */
25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o9[0-3] */
26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o10[0-3] */
26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o11[0-3] */
26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o12[0-3] */
27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
27430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
27470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
27480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o13[0-3] */
27500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
27520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
27530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
27550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
27580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
27600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
27610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
27630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
27640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
27650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
27660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
27760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
27790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
27800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o14[0-3] */
27820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
27840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
27850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
27870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
27900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
27920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
27930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
27950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
27960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
27970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
27980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
28000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
28010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
28020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
28030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
28050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
28070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
28080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
28120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
28130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o15[0-3] */
28150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
28160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
28170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
28180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
28200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
28220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
28230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
28250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
28260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
28280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
28290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
28300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
28310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
28330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
28340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
28350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
28360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
28380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
28400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
28410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
28440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
28450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
28460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
28470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
28480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o & stage 1 out */
28490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
28500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 j;
28510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_src_scratch = o_temp_ptr;
28520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = temp_ptr;
28530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = (trans_size << 1);
28540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 in_stride = trans_size;
28550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
28580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
28590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j)
28600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
28610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
28620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
28630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
28640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
28650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
28660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
28670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
28680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
28690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
28710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
28720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
28730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
28740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
28750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
28760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
28770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
28780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
28810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
28820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
28830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
28840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
28850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
28860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
28870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
28880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
28900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
28910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
28920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
28930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
28940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
28950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
28960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
28970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
29000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
29010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
29020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
29030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
29040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
29050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
29070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
29080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
29100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
29120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
29130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
29140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
29150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
29170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
29180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
29200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
29220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
29240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
29250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
29270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
29280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
29300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
29310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
29320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
29330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
29350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
29360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
29370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
29380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
29400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
29420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
29430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
29450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
29470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
29480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
29490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
29500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
29510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
29520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
29530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
29540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
29570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
29580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
29590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
29600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
29610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
29620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
29640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
29650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
29670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
29690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
29700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
29710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
29720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
29740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
29750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
29770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
29790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
29810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
29820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
29840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
29850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
29870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
29880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
29890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
29900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
29920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
29930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
29940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
29950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
29970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
29990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
30000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
30020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
30040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
30050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
30060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
30070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
30080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
30090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
30100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
30110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
30130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
30140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
30150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
30160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
30170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
30180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
30200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
30210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
30230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
30250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
30260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
30270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
30280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
30300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
30310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
30330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
30350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
30370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
30380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
30400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
30410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
30430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
30440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
30450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
30460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
30480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
30490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
30500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
30510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
30530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
30550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
30560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
30580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
30610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
30620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
30630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
30640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
30650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
30660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
30670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
30680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
30700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
30710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
30720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
30730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
30740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
30750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
30770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
30780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
30800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
30820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
30830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
30840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
30850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
30870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
30880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
30900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
30920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
30940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
30950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
30970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
30980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
31000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
31010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
31020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
31030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
31050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
31060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
31070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
31080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
31100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
31120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
31130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
31150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
31170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
31180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
31190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
31200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
31210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
31220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
31230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
31240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
31260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
31270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
31280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
31290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
31300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
31310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
31330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
31340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
31360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
31380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
31390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
31400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
31410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
31430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
31440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
31460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
31480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
31500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
31510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
31530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
31540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
31560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
31570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
31580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
31590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
31610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
31620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
31630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
31640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
31660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
31680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
31690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
31710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
31740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
31750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
31760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
31770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
31780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
31790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
31800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
31810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
31830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
31840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
31850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
31860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
31870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
31880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
31900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
31910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
31930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
31950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
31960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
31970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
31980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
32000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
32010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
32030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
32050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
32070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
32080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
32100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
32110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
32130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
32140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
32150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
32160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
32180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
32190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
32200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
32210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
32230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
32250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
32260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
32280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
32300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
32310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
32320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
32330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
32340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
32350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
32360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
32370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
32400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
32410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
32420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
32430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
32440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
32450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
32470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
32480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
32500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
32520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
32530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
32540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
32550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
32570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
32580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
32600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
32620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
32640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
32650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
32670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
32680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
32700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
32710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
32720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
32730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
32750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
32760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
32770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
32780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
32800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
32820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
32830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
32850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
32870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
32880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
32890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
32900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
32910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
32920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
32930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
32940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3]
                     * Uses the eight coefficient vectors loaded just above (rows 56..63 of
                     * g_ai2_ihevc_trans_32_intr_odd_packed). Eight multiply-add products are
                     * summed into m_temp_reg_20, combined with a vector loaded from
                     * pi2_src_scratch as (loaded + sum) and (loaded - sum), rounded, shifted
                     * right by i4_shift, packed to 16 bits and stored to pi2_dst_scratch.
                     * m_temp_reg_10..13 and temp1..temp4 are read but not written here; they
                     * are prepared before this block (outside the visible range).
                     * NOTE(review): "o7" presumably names the odd-part output with index 7 -
                     * confirm against the full function.
                     */
32960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
                        /* _mm_madd_epi16: 16-bit pairs are multiplied and adjacent products
                         * are added, giving four 32-bit partial sums per register */
32970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
32980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
32990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
33000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
33010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
33030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
33040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
33060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* second group of four products, from temp1..temp4 */
33070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
33080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
33090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
33100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
33110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
33130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
33140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
33160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_20 now holds the sum of all eight products */
33170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
33180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit load from the source scratch buffer (used below as
                         * four 32-bit lanes), then step the source pointer forward by 8 elements */
33190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
33200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
33210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_31 = loaded + sum, m_temp_reg_30 = loaded - sum */
33220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
33230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
33240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* rounding term (1 << (i4_shift - 1)) replicated to all four 32-bit
                         * lanes by the two unpacks; m_count carries the shift amount.
                         * NOTE(review): both depend only on i4_shift and are rebuilt here;
                         * presumably hoistable - confirm against the full function. */
33250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
33260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
33270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
33280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
33290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* add the rounding term, then arithmetic right shift by i4_shift */
33300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
33310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
33320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
33330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
33340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* signed-saturating pack to 16 bits: low four values are the
                         * differences, high four values are the sums */
33350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
33360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit store, then step the destination pointer forward by 8 elements */
33370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
33380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
33390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
33410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
33430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
33440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
33450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
33460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
33470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
33480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
33490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
33500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o8[0-3]
                     * Uses the eight coefficient vectors loaded just above (rows 64..71 of
                     * g_ai2_ihevc_trans_32_intr_odd_packed). Eight multiply-add products are
                     * summed into m_temp_reg_20, combined with a vector loaded from
                     * pi2_src_scratch as (loaded + sum) and (loaded - sum), rounded, shifted
                     * right by i4_shift, packed to 16 bits and stored to pi2_dst_scratch.
                     * Unlike a forward-stepping stage, both scratch pointers move backwards
                     * here (by in_stride / out_stride).
                     * m_temp_reg_10..13 and temp1..temp4 are read but not written here; they
                     * are prepared before this block (outside the visible range).
                     * NOTE(review): "o8" presumably names the odd-part output with index 8 -
                     * confirm against the full function.
                     */
33530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
33540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* _mm_madd_epi16: 16-bit pairs are multiplied and adjacent products
                         * are added, giving four 32-bit partial sums per register */
33550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
33560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
33570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
33580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
33590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
33610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
33620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
33640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* second group of four products, from temp1..temp4 */
33650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
33660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
33670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
33680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
33690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
33710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
33720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
33740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_20 now holds the sum of all eight products */
33750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
33760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit load from the source scratch buffer (used below as
                         * four 32-bit lanes), then step the source pointer back by in_stride */
33770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
33780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
33790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_31 = loaded + sum, m_temp_reg_30 = loaded - sum */
33800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
33810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
33820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* rounding term (1 << (i4_shift - 1)) replicated to all four 32-bit
                         * lanes by the two unpacks; m_count carries the shift amount.
                         * NOTE(review): both depend only on i4_shift and are rebuilt here;
                         * presumably hoistable - confirm against the full function. */
33830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
33840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
33850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
33860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
33870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* add the rounding term, then arithmetic right shift by i4_shift */
33880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
33890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
33900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
33910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
33920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* signed-saturating pack to 16 bits: low four values are the
                         * differences, high four values are the sums */
33930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
33940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit store, then step the destination pointer back by out_stride */
33950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
33960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
33970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
33980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
34000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
34010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
34020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
34030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
34040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
34050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
34060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
34070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o9[0-3]
                     * Uses the eight coefficient vectors loaded just above (rows 72..79 of
                     * g_ai2_ihevc_trans_32_intr_odd_packed). Eight multiply-add products are
                     * summed into m_temp_reg_20, combined with a vector loaded from
                     * pi2_src_scratch as (loaded + sum) and (loaded - sum), rounded, shifted
                     * right by i4_shift, packed to 16 bits and stored to pi2_dst_scratch.
                     * Both scratch pointers move backwards (by in_stride / out_stride).
                     * m_temp_reg_10..13 and temp1..temp4 are read but not written here; they
                     * are prepared before this block (outside the visible range).
                     * NOTE(review): "o9" presumably names the odd-part output with index 9 -
                     * confirm against the full function.
                     */
34100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
                        /* _mm_madd_epi16: 16-bit pairs are multiplied and adjacent products
                         * are added, giving four 32-bit partial sums per register */
34110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
34120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
34130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
34140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
34150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
34170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
34180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
34200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* second group of four products, from temp1..temp4 */
34210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
34220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
34230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
34240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
34250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
34270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
34280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
34300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_20 now holds the sum of all eight products */
34310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
34320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit load from the source scratch buffer (used below as
                         * four 32-bit lanes), then step the source pointer back by in_stride */
34330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
34340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
34350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_31 = loaded + sum, m_temp_reg_30 = loaded - sum */
34360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
34370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
34380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* rounding term (1 << (i4_shift - 1)) replicated to all four 32-bit
                         * lanes by the two unpacks; m_count carries the shift amount.
                         * NOTE(review): both depend only on i4_shift and are rebuilt here;
                         * presumably hoistable - confirm against the full function. */
34390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
34400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
34410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
34420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
34430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* add the rounding term, then arithmetic right shift by i4_shift */
34440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
34450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
34460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
34470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
34480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* signed-saturating pack to 16 bits: low four values are the
                         * differences, high four values are the sums */
34490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
34500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit store, then step the destination pointer back by out_stride */
34510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
34520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
34530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
34540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
34560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
34570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
34580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
34590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
34600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
34610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
34620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
34630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o10[0-3]
                     * Uses the eight coefficient vectors loaded just above (rows 80..87 of
                     * g_ai2_ihevc_trans_32_intr_odd_packed). Eight multiply-add products are
                     * summed into m_temp_reg_20, combined with a vector loaded from
                     * pi2_src_scratch as (loaded + sum) and (loaded - sum), rounded, shifted
                     * right by i4_shift, packed to 16 bits and stored to pi2_dst_scratch.
                     * Both scratch pointers move backwards (by in_stride / out_stride).
                     * m_temp_reg_10..13 and temp1..temp4 are read but not written here; they
                     * are prepared before this block (outside the visible range).
                     * NOTE(review): "o10" presumably names the odd-part output with index 10 -
                     * confirm against the full function.
                     */
34650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
                        /* _mm_madd_epi16: 16-bit pairs are multiplied and adjacent products
                         * are added, giving four 32-bit partial sums per register */
34660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
34670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
34680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
34690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
34700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
34720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
34730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
34750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* second group of four products, from temp1..temp4 */
34760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
34770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
34780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
34790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
34800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
34820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
34830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
34850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_20 now holds the sum of all eight products */
34860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
34870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit load from the source scratch buffer (used below as
                         * four 32-bit lanes), then step the source pointer back by in_stride */
34880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
34890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
34900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_31 = loaded + sum, m_temp_reg_30 = loaded - sum */
34910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
34920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
34930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* rounding term (1 << (i4_shift - 1)) replicated to all four 32-bit
                         * lanes by the two unpacks; m_count carries the shift amount.
                         * NOTE(review): both depend only on i4_shift and are rebuilt here;
                         * presumably hoistable - confirm against the full function. */
34940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
34950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
34960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
34970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
34980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* add the rounding term, then arithmetic right shift by i4_shift */
34990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
35000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
35010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
35020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
35030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* signed-saturating pack to 16 bits: low four values are the
                         * differences, high four values are the sums */
35040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
35050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit store, then step the destination pointer back by out_stride */
35060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
35070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
35080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
35090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
35110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
35120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
35130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
35140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
35150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
35160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
35170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
35180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o11[0-3]
                     * Uses the eight coefficient vectors loaded just above (rows 88..95 of
                     * g_ai2_ihevc_trans_32_intr_odd_packed). Eight multiply-add products are
                     * summed into m_temp_reg_20, combined with a vector loaded from
                     * pi2_src_scratch as (loaded + sum) and (loaded - sum), rounded, shifted
                     * right by i4_shift, packed to 16 bits and stored to pi2_dst_scratch.
                     * Both scratch pointers move backwards (by in_stride / out_stride).
                     * m_temp_reg_10..13 and temp1..temp4 are read but not written here; they
                     * are prepared before this block (outside the visible range).
                     * NOTE(review): "o11" presumably names the odd-part output with index 11 -
                     * confirm against the full function.
                     */
35200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
                        /* _mm_madd_epi16: 16-bit pairs are multiplied and adjacent products
                         * are added, giving four 32-bit partial sums per register */
35210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
35220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
35230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
35240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
35250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
35270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
35280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
35300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* second group of four products, from temp1..temp4 */
35310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
35320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
35330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
35340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
35350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
35370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
35380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
35400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_20 now holds the sum of all eight products */
35410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
35420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit load from the source scratch buffer (used below as
                         * four 32-bit lanes), then step the source pointer back by in_stride */
35430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
35440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
35450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* m_temp_reg_31 = loaded + sum, m_temp_reg_30 = loaded - sum */
35460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
35470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
35480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* rounding term (1 << (i4_shift - 1)) replicated to all four 32-bit
                         * lanes by the two unpacks; m_count carries the shift amount.
                         * NOTE(review): both depend only on i4_shift and are rebuilt here;
                         * presumably hoistable - confirm against the full function. */
35490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
35500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
35510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
35520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
35530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* add the rounding term, then arithmetic right shift by i4_shift */
35540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
35550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
35560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
35570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
35580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* signed-saturating pack to 16 bits: low four values are the
                         * differences, high four values are the sums */
35590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
35600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
                        /* aligned 128-bit store, then step the destination pointer back by out_stride */
35610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
35620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
35630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
35650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
35670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
35680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
35690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
35700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
35710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
35720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
35730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
35740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o12[0-3] */
35770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
35780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
35790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
35800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
35810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
35820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
35840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
35850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
35870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
35890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
35900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
35910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
35920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
35940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
35950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
35970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
35990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
36010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
36020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
36040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
36050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
36070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
36080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
36090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
36100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
36120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
36130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
36140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
36150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
36170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
36190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
36200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
36220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
36240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
36250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
36260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
36270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
36280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
36290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
36300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
36310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o13[0-3] */
36340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
36350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
36360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
36370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
36380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
36390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
36410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
36420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
36440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
36460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
36470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
36480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
36490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
36510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
36520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
36540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
36560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
36580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
36590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
36610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
36620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
36640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
36650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
36660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
36670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
36690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
36700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
36710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
36720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
36740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
36760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
36770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
36780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
36800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
36810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
36820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
36830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
36840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
36850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
36860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
36870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o14[0-3] */
36900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
36910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
36920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
36930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
36940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
36950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
36970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
36980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
37000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
37020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
37030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
37040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
37050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
37070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
37080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
37100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
37120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
37140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
37150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
37170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
37180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
37200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
37210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
37220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
37230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
37250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
37260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
37270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
37280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
37300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
37320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
37330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
37350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
37370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
37380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
37390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
37400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
37410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
37420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
37430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
37440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o15[0-3] */
37460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
37470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
37480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
37490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
37500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
37510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
37530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
37540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
37560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
37580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
37590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
37600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
37610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
37630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
37640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
37660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
37680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
37700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
37710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
37730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
37740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
37760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
37770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
37780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
37790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
37810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
37820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
37830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
37840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
37860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
37880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
37890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
37900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
37920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
37930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
37940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Transpose */
37950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
37960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_src_scratch = temp_ptr;
37970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_dst_scratch = pi2_tmp;
37980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 in_stride = (trans_size << 1);
37990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(j = 0; j < 2; j++)
38010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
38020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
38030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
38050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
38070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
38090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
38110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
38130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
38150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
38160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
38170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += 8;
38180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
38200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
38220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
38240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
38260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
38280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
38300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
38320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
38330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
38340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += 8;
38350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
38380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
38390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
38410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
38420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
38440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
38450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
38470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
38480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
38500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
38510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
38530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
38540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
38560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
38570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
38590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
38600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /****************/
38620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
38640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
38650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
38670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
38680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
38700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
38710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
38730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
38740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
38760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
38770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
38790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
38800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
38820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
38830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
38850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
38860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /******************/
38880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
38900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
38910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
38930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
38940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
38960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
38970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
38990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
39000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
39020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
39030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
39050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
39060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
39080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
39090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
39110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
39120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
39140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
39150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
39160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
39170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
39190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
39200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
39210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
39220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
39240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
39250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
39260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
39270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
39290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
39300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
39310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
39320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch += 4 * trans_size;
39340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
39350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
39360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_src += 8;
39370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//      pi2_dequant_coeff +=8;
39380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_tmp += 8 * trans_size;
39390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        zero_cols = zero_cols >> 1;
39400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
39410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(trans_size_stg1 != TRANS_SIZE_32)
39430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
39440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_10 = _mm_setzero_si128();
39450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(i = trans_size_stg1; i < 32; i += 8)
39470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
39480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_dst_scratch = pi2_tmp;
39490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
39510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
39520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
39530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
39540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
39560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
39570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
39580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
39590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
39610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
39620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
39630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
39640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
39660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
39670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
39680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
39690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
39710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
39720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
39730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
39740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
39760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
39770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
39780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
39790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
39810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
39820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
39830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
39840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
39860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
39870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
39880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
39890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp += 8 * trans_size;
39910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
39920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
39930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_tmp = pi2_tmp_orig;
39950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Inverse Transform 2nd stage */
39970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(j = 0; j < trans_size; j += 4)
39990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
40000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        i4_shift = IT_SHIFT_STAGE_2;
40010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
40030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(zero_last28_rows_stg2)
40040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
40050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
40060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
40080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
40090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
40100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
40110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
40120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
40130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
40140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
40150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
40170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
40190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
40210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
40230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
40260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
40280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
40310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
40330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
40360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
40380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[0-3] */
40400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
40420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[0-3] */
40450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
40470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[0-3] */
40500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
40520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[0-3] */
40540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
40550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
40560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
40570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
40580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
40600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
40620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
40640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
40660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
40680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* e[]*/
40700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[0] */
40720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[15] */
40730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[1] */
40750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[14] */
40760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[2] */
40780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[13] */
40790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[3] */
40810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[12] */
40820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[4] */
40840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[11] */
40850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[5] */
40870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[10] */
40880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[6] */
40900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[9] */
40910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[7] */
40930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[8] */
40940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*o[k]*/
40960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
40970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = temp_ptr;
40990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = 8;
41000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
41020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
41040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
41050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
41070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0[0-3] */
41100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
41110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
41120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
41140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
41150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
41170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
41180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
41190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
41200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
41220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
41230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
41240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
41250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
41270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
41290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
41300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
41320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
41340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o1[0-3] */
41360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
41370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
41380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
41400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
41410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
41430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
41440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
41450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
41460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
41480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
41490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
41500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
41510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
41530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
41550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
41560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
41580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
41600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o2[0-3] */
41620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
41630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
41640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
41660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
41670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
41690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
41700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
41710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
41720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
41740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
41750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
41760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
41770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
41790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
41810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
41820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
41840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
41860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o3[0-3] */
41880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
41890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
41900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
41920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
41930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
41950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
41960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
41970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
41980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
42000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
42010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
42020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
42030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
42050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
42070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
42080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
42100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
42120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o4[0-3] */
42140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
42150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
42160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
42180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
42190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
42210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
42220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
42230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
42240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
42260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
42270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
42280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
42290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
42310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
42330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
42340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
42360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
42380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o5[0-3] */
42400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
42410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
42420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
42440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
42450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
42470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
42480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
42490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
42500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
42520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
42530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
42540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
42550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
42570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
42590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
42600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
42620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
42640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o6[0-3] */
42660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
42670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
42680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
42700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
42710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
42730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
42740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
42750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
42760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
42780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
42790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
42800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
42810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
42830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
42850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
42860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
42880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
42900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o7[0-3] */
42920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
42930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
42940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
42960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
42970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
42980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
42990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
43000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
43010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
43020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
43040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
43050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
43060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
43070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
43090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
43110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += 8;
43120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
43140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
43160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o8[0-3] */
43180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
43190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
43200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
43220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
43230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
43250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
43260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
43270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
43280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
43300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
43310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
43320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
43330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
43350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
43370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
43380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
43390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
43410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o9[0-3] */
43430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
43440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
43450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
43470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
43480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
43500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
43510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
43520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
43530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
43550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
43560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
43570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
43580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
43600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
43620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
43630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
43650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
43670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o10[0-3] */
43690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
43700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
43710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
43730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
43740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
43760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
43770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
43780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
43790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
43810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
43820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
43830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
43840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
43860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
43880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
43890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
43900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
43920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o11[0-3] */
43940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
43950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
43960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
43970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
43980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
43990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
44010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
44020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
44030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
44040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
44060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
44070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
44080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
44090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
44110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
44130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
44140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
44160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
44180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o12[0-3] */
44200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
44210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
44220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
44240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
44250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
44270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
44280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
44290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
44300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
44320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
44330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
44340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
44350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
44370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
44390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
44400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
44420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
44440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o13[0-3] */
44460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
44470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
44480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
44500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
44510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
44530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
44540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
44550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
44560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
44580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
44590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
44600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
44610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
44630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
44650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
44660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
44670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
44690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o14[0-3] */
44710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
44720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
44730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
44750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
44760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
44780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
44790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
44800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
44810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
44830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
44840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
44850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
44860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
44880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
44900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
44910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
44930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
44950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
44960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o15[0-3] */
44970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
44980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
44990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
45010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
45020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
45040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
45050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
45060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
45070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
45090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
45100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
45110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
45120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
45140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
45160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += 8;
45170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
45200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
45220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(zero_last24_rows_stg2)
45230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
45240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
45250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
45260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
45270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
45290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
45300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
45320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
45350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
45410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
45430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
45480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
45500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
45560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
45580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
45650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[0-3] */
45670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
45730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[0-3] */
45750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
45800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[0-3] */
45810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
45860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[0-3] */
45870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
45880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
45890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
45910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
45930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo */
45950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
45960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
45970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
45980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
45990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
46000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
46010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
46030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
46050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo0[0-3] */
46070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
46080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
46090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
46110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo1[0-3] */
46130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
46140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
46150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
46170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo2[0-3] */
46190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
46200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
46210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
46230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo3[0-3] */
46260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
46270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
46280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
46300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
46320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
46340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
46350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
46360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
46380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
46400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
46410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
46430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
46440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
46460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
46470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
46490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
46500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
46520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
46530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
46550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
46560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* e[]*/
46580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
46600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
46610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
46630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
46640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
46660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
46670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
46690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
46700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
46720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
46730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
46750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
46760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
46780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
46790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
46810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
46820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*o[k] */
46840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
46850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = temp_ptr;
46870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = 8;
46880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
46900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
46910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
46930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
46940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
46950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
46960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
46970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
46980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
46990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0[0-3] */
47010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
47020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
47030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
47040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
47060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
47080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
47090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
47110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
47120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
47130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
47140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
47160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
47170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
47180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
47190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
47210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
47230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
47240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
47260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
47290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
47300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o1[0-3] */
47320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
47330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
47340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
47350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
47370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
47390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
47400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
47420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
47430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
47440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
47450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
47470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
47480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
47490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
47500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
47520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
47540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
47550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
47570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
47590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
47600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o2[0-3] */
47620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
47630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
47640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
47650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
47670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
47690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
47700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
47720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
47730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
47740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
47750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
47770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
47780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
47790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
47800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
47820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
47840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
47850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
47870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
47890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
47900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o3[0-3] */
47920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
47930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
47940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
47950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
47970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
47980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
47990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
48000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
48020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
48030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
48040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
48050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
48070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
48080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
48090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
48100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
48120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
48140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
48150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
48170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
48190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
48200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o4[0-3] */
48220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
48230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
48240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
48250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
48270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
48290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
48300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
48320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
48330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
48340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
48350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
48370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
48380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
48390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
48400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
48420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
48440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
48450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
48470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
48490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
48500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o5[0-3] */
48520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
48530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
48540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
48550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
48570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
48590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
48600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
48620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
48630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
48640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
48650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
48670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
48680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
48690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
48700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
48720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
48740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
48750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
48770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
48790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
48800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o6[0-3] */
48820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
48830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
48840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
48850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
48870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
48890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
48900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
48920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
48930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
48940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
48950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
48960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
48970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
48980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
48990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
49000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
49020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
49040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
49050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
49070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
49090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
49100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o7[0-3] */
49120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
49130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
49140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
49150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
49170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
49190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
49200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
49220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
49230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
49240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
49250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
49270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
49280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
49290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
49300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
49320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
49340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += 8;
49350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
49370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
49390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
49400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o8[0-3] */
49420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
49430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
49440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
49450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
49470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
49490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
49500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
49520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
49530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
49540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
49550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
49570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
49580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
49590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
49600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
49620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
49640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
49650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
49660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
49680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
49690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o9[0-3] */
49710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
49720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
49730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
49740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
49760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
49780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
49790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
49810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
49820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
49830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
49840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
49860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
49870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
49880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
49890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
49910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
49930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
49940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
49950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
49970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
49980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
49990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o10[0-3] */
50000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
50010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
50020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
50030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
50050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
50070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
50080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
50100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
50110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
50120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
50130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
50150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
50160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
50170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
50180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
50200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
50220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
50230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
50240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
50260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
50270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o11[0-3] */
50290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
50300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
50310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
50320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
50340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
50360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
50370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
50390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
50400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
50410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
50420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
50440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
50450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
50460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
50470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
50490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
50510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
50520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
50540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
50560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
50570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o12[0-3] */
50590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
50600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
50610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
50620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
50640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
50660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
50670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
50690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
50700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
50710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
50720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
50740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
50750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
50760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
50770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
50790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
50810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
50820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
50840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
50860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
50870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o13[0-3] */
50890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
50900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
50910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
50920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
50940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
50960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
50970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
50980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
50990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
51000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
51010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
51020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
51040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
51050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
51060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
51070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
51090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
51110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
51120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
51130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
51150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
51160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o14[0-3] */
51180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
51190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
51200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
51210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
51230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
51250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
51260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
51280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
51290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
51300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
51310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
51330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
51340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
51350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
51360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
51380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
51400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
51410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
51420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
51440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
51450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o15[0-3] */
51470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
51480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
51490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
51500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
51520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
51540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
51550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
51570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
51580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
51590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
51600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
51620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
51630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
51640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
51650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
51670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
51690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += 8;
51700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
51710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
51730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
51740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
51750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
51760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
51770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
51780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
51800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
51810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
51820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
51830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
51860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
51870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
51880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
51890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
51900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
51910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
51920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
51930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
51950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
51960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
51970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
51980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
51990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
52000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
52010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
52020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
52030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
52050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
52070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
52080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
52100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
52120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
52140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
52160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
52170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
52180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
52190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
52210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
52220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
52230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
52240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
52260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
52280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
52290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
52310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
52330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
52350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
52370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
52380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
52390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
52400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
52420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
52430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
52440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
52450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
52470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
52490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
52500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
52520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
52540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
52560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
52580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
52590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
52600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
52610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
52630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
52640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
52650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
52660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
52680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
52700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
52710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
52730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
52750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
52770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
52790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
52800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
52810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
52820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo4[0-3] */
52850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
52860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
52870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
52880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
52900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
52920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
52930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
52950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
52970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
52980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
52990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
53010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
53020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
53030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
53040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo5[0-3] */
53060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
53070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
53080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
53090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
53110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
53130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
53140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
53160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
53180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
53190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
53210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
53220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
53230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
53240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo6[0-3] */
53260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
53270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
53280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
53290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
53310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
53330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
53340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
53360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
53380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
53400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
53420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
53430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
53440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
53450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo7[0-3] */
53470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
53480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
53490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
53500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
53520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
53540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
53550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
53570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
53590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
53620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
53640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo */
53660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
53670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
53680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
53690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
53710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
53720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
53730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
53740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo0[0-3] */
53760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
53770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
53790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
53800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
53820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
53830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
53850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
53870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
53890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
53900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo1[0-3] */
53920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
53930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
53940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
53950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
53970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
53980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
53990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
54010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
54020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo2[0-3] */
54040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
54050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
54060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
54070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
54090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
54110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
54130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
54140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo3[0-3] */
54160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
54170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
54180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
54190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
54210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
54230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
54260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
54280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
54290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
54310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
54320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
54340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
54350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
54370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
54390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
54400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
54420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
54440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
54450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
54470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
54480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[0]= m_temp_reg_20  */
54500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeeo[1]= m_temp_reg_22  */
54510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[0]= m_temp_reg_21  */
54520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* eeee[1]= m_temp_reg_23  */
54530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[0] = eeee[0] + eeeo[0]; */
54550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
54560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[3] = eeee[0] - eeeo[0]; */
54580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
54590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[2] = eeee[1] - eeeo[1]; */
54610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
54620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[1] = eeee[1] + eeeo[1];*/
54640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
54650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
54670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
54680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
54700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
54710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
54730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
54740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
54760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
54770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* e[]*/
54790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
54810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
54820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
54840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
54850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
54870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
54880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
54900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
54910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
54930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
54940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
54960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
54970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
54980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
54990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
55000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
55020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
55030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*o[k] */
55050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
55060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = temp_ptr;
55080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = 8;
55090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
55110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
55120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
55130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
55140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
55150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
55160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
55170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
55180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
55210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
55220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
55230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
55240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
55250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
55260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
55270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
55280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
55300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
55310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
55320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
55330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
55340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
55350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
55360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
55370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
55390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
55400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
55410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
55420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
55430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
55440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
55450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
55460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0[0-3] */
55480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
55490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
55500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
55510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
55520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
55530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
55550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
55560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
55580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
55600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
55610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
55620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
55630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
55650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
55660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
55680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
55700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
55720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
55730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
55750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
55760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
55770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
55780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
55800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
55810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
55820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
55830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
55850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
55870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
55880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
55900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
55910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
55920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
55930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
55940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
55950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
55960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
55970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
55980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
55990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o1[0-3] */
56010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
56020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
56030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
56040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
56050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
56060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
56080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
56090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
56110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
56130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
56140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
56150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
56160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
56180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
56190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
56210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
56230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
56250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
56260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
56280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
56290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
56300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
56310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
56330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
56340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
56350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
56360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
56380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
56400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
56410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
56430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
56450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
56460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
56470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
56480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
56490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
56500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
56510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
56520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o2[0-3] */
56540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
56550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
56560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
56570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
56580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
56590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
56610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
56620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
56640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
56660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
56670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
56680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
56690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
56710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
56720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
56740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
56760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
56780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
56790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
56810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
56820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
56830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
56840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
56860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
56870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
56880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
56890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
56910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
56930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
56940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
56960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
56970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
56980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
56990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
57000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
57010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
57020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
57030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
57040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
57050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o3[0-3] */
57070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
57080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
57090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
57100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
57110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
57120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
57140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
57150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
57170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
57190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
57200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
57210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
57220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
57240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
57250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
57270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
57290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
57310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
57320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
57340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
57350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
57360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
57370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
57390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
57400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
57410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
57420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
57440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
57460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
57470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
57490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
57510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
57520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
57530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
57540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
57550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
57560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
57570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
57580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o4[0-3] */
57600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
57610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
57620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
57630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
57640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
57650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
57670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
57680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
57700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
57720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
57730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
57740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
57750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
57770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
57780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
57800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
57820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
57840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
57850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
57860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
57870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
57880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
57890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
57910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
57920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
57930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
57940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
57960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
57970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
57980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
57990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
58010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
58030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
58040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
58050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
58060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
58070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
58080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
58090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
58100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o5[0-3] */
58120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
58130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
58140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
58150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
58160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
58170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
58190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
58200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
58220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
58240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
58250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
58260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
58270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
58290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
58300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
58320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
58340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
58360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
58370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
58390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
58400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
58410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
58420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
58440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
58450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
58460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
58470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
58490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
58510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
58520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
58540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
58560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
58570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
58580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
58590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
58600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
58610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
58620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
58630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o6[0-3] */
58650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
58660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
58670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
58680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
58690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
58700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
58720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
58730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
58750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
58770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
58780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
58790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
58800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
58820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
58830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
58850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
58870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
58890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
58900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
58920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
58930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
58940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
58950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
58960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
58970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
58980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
58990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
59000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
59020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
59040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
59050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
59070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
59090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
59100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
59110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
59120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
59130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
59140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
59150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
59160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o7[0-3] */
59180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
59190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
59200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
59210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
59220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
59230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
59250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
59260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
59280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
59300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
59310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
59320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
59330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
59350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
59360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
59380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
59400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
59420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
59430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
59450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
59460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
59470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
59480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
59500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
59510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
59520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
59530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
59550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
59570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += 8;
59580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
59600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
59620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
59630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
59640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
59650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
59660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
59670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
59680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
59690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o8[0-3] */
59710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
59720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
59730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
59740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
59750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
59760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
59780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
59790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
59810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
59830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
59840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
59850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
59860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
59880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
59890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
59910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
59930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
59950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
59960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
59970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
59980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
59990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
60000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
60010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
60030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
60040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
60050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
60060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
60080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
60100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
60110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
60120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
60140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
60150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
60160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
60170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
60180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
60190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
60200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
60210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o9[0-3] */
60230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
60240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
60250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
60260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
60270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
60280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
60300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
60310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
60330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
60350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
60360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
60370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
60380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
60400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
60410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
60430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
60450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
60470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
60480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
60500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
60510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
60520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
60530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
60550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
60560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
60570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
60580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
60600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
60620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
60630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
60640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
60660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
60670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
60680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
60690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
60700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
60710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
60720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
60730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o10[0-3] */
60750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
60760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
60770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
60780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
60790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
60800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
60820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
60830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
60850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
60870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
60880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
60890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
60900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
60920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
60930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
60950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
60970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
60980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
60990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
61000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
61020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
61030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
61040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
61050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
61070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
61080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
61090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
61100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
61120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
61140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
61150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
61160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
61190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
61200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
61210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
61220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
61230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
61240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
61250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
61260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o11[0-3] */
61280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
61290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
61300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
61310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
61320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
61330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
61350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
61360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
61380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
61400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
61410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
61420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
61430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
61450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
61460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
61480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
61500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
61520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
61530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
61550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
61560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
61570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
61580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
61600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
61610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
61620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
61630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
61650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
61670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
61680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
61700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
61720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
61730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
61740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
61750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
61760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
61770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
61780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
61790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o12[0-3] */
61810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
61820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
61830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
61840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
61850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
61860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
61880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
61890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
61910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
61930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
61940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
61950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
61960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
61970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
61980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
61990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
62010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
62030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
62050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
62060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
62080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
62090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
62100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
62110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
62130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
62140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
62150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
62160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
62180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
62200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
62210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
62230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
62250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
62260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
62270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
62280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
62290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
62300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
62310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
62320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o13[0-3] */
62340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
62350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
62360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
62370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
62380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
62390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
62410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
62420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
62440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
62460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
62470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
62480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
62490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
62510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
62520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
62540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
62560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
62580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
62590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
62610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
62620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
62630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
62640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
62660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
62670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
62680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
62690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
62710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
62730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
62740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
62750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
62770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
62780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
62790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
62800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
62810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
62820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
62830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
62840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o14[0-3] */
62860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
62870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
62880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
62890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
62900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
62910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
62930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
62940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
62960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
62970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
62980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
62990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
63000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
63010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
63030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
63040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
63060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
63080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
63100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
63110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
63130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
63140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
63150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
63160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
63180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
63190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
63200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
63210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
63230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
63250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += out_stride;
63260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
63280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
63300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
63310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
63320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
63330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
63340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
63350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
63360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
63370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o15[0-3] */
63390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
63400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
63410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
63420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
63430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
63440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
63460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
63470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
63490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
63510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
63520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
63530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
63540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
63560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
63570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
63590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
63610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
63630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
63640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
63660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_count = _mm_cvtsi32_si128(i4_shift);
63670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
63680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
63690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
63710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
63720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
63730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
63740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
63760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
63780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst_scratch += 8;
63790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
63800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
63820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
63840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Transpose */
63860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
63870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_src_scratch = temp_ptr;
63890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 out_stride = dst_strd;
63900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 in_stride = 8;
63910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
63920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
63930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
63940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
63950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
63960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
63970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
63980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
63990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
64010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
64030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
64050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
64070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += 8;
64080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
64100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
64120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
64140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
64160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
64180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
64200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
64220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += in_stride;
64230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
64240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch += 8;
64250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
64280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
64290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
64310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
64320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
64340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
64350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
64370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
64380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
64400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
64410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
64430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
64440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
64460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
64470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
64490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
64500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
64530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
64540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
64560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
64570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
64590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
64600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
64620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
64630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
64650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
64660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
64680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
64690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
64710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
64720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
64740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
64750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
64780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
64790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
64810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
64820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
64840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
64850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
64870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
64880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
64900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
64910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
64930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
64940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
64960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
64970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
64980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
64990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
65000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
65020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
65060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
65090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
65110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
65120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
65140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
65160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
65200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
65230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
65250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
65260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
65280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += out_stride;
65290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
65300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
65330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
65370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
65400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
65420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
65430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
65450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
65470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
65510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
65540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
65560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
65570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
65590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += out_stride;
65600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
65610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
65630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
65670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
65700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
65720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
65730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
65750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
65770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
65810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
65840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
65860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
65870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
65890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += out_stride;
65900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
65910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
65940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
65960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
65970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
65980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
65990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
66010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
66030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
66040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
66060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
66080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
66100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
66120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
66130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
66150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
66170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
66180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
66200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += out_stride;
66210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
66220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
66240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi2_tmp += 4;
66250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
66260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
66270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
66280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6629