10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_iquant_itrans_recon_atom_intr.c
 *
 * @brief
 *  Contains function definitions for inverse quantization, inverse
 *  transform and reconstruction
 *
 * @author
 *  100470
 *  100592 (edited by)
 *
 * @par List of Functions:
 *  - ihevc_itrans_recon_16x16_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h>
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <string.h>
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_tables.h"
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_itrans_recon.h"
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_macros.h"
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h>
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h>
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse transform and reconstruction for a
 *  16x16 input block
 *
 * @par Description:
 *  Performs inverse transform, adds the prediction data and clips the
 *  output to 8 bit
 *
 * @param[in] pi2_src
 *  Input 16x16 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 16x16 buffer for storing inverse
 *  transform 1st stage output
 *
 * @param[in] pu1_pred
 *  Prediction 16x16 block
 *
 * @param[out] pu1_dst
 *  Output 16x16 block
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Zero columns in pi2_src (bit mask; tested against 0xFF00 and 0xFFF0 to
 *  select the reduced-work paths)
 *
 * @param[in] zero_rows
 *  Zero rows in pi2_src (bit mask; tested against 0xFF00 and 0xFFF0 to
 *  detect that the last 8 / last 12 rows are zero in stage 1)
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src,
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD16 *pi2_tmp,
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    UWORD8 *pu1_pred,
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    UWORD8 *pu1_dst,
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 src_strd,
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 pred_strd,
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 dst_strd,
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 zero_cols,
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 zero_rows)
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20;
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_21;
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_22;
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_23;
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_24;
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_25;
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_26;
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_27;
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_30;
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_31;
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_32;
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_33;
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_34;
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_35;
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_36;
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_37;
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_40;
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_41;
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_42;
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_43;
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_44;
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_45;
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_46;
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_47;
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_70;
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_71;
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_72;
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_73;
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_74;
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_75;
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_76;
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_77;
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_rdng_factor;
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_count;
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i;
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*Lokesh*/
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last8_cols_stg1;
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last8_rows_stg1;
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last12_rows_stg1;
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last12_rows_stg2;
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  zero_last8_rows_stg2;
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32  loop = 0;
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i4_shift = IT_SHIFT_STAGE_1;
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 trans_size = TRANS_SIZE_16;
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Following 3 instructions replicates the value in the */
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lower 16 bits of m_add_iq in the entire register */
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_last8_rows_stg2 = zero_last8_cols_stg1;
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(zero_last8_cols_stg1)
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        loop = 1;
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        loop = 2;
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* i = 0 => lower 8 samples */
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* i = 1 => higher 8 samples */
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(i = 0; i < loop; i++)
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 sample_half_index = i << 3;
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* If last 12 rows are zero : Rishab */
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if(zero_last12_rows_stg1)
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee */
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff and src for use in next block */
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = m_temp_reg_24;
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = m_temp_reg_25;
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo */
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* If last 8 rows are zero : Rishab */
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else if(zero_last8_rows_stg1)
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo */
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee */
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff and src for use in next block */
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to  get signs
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = m_temp_reg_24;
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = m_temp_reg_25;
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* If all the rows are non-zero : Rishab */
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo */
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee */
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff and src for use in next block */
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += 8;
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 sample_half_index = i << 3;
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_tmp_src += (src_strd << 1);
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* o & stage 1 out */
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 j;
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 out_stride = (trans_size << 1);
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 in_stride = trans_size << 1;
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if(zero_last12_rows_stg1)
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    else
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else if(zero_last8_rows_stg1)
10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j)
10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    else
10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else
12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    else
13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch -= out_stride;
15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += 8;
15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Transpose */
15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 out_stride = (trans_size << 1);
15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 in_stride = (trans_size << 1);
15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 j;
15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(j = 0; j < 2; j++)
15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += 8;
15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch -= in_stride;
16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += 8;
16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch += out_stride;
16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch += out_stride;
16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch += out_stride;
16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch += 8;
16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch -= out_stride;
16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch -= out_stride;
16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch -= out_stride;
16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst_scratch += 8;
16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(zero_last8_cols_stg1)
16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 out_stride = (trans_size << 1);
16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 j;
16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_40 = _mm_setzero_si128();
16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(j = 0; j < 2; j++)
16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch += out_stride;
16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch += out_stride;
16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch += out_stride;
16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch += 8;
16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch -= out_stride;
16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch -= out_stride;
16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch -= out_stride;
16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst_scratch += 8;
16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Stage 2 */
16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    for(i = 0; i < 2; i++)
16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 stride = (trans_size);
17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        MEM_ALIGN16 WORD16 temp_array[256];
17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        i4_shift = IT_SHIFT_STAGE_2;
17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(zero_last12_rows_stg2)
17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo */
17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride * 9);
17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                if(!i)
17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp += (stride * 6 + 8);
17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                else
17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp += (stride * 2 + 8);
17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride * 9);
17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_setzero_si128();
17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_22 = _mm_setzero_si128();
17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_21 = _mm_setzero_si128();
17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_23 = _mm_setzero_si128();
17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee */
17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff and src for use in next block */
17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Sign-extend row 0 to 32 bits (m_temp_reg_20 is zero, so cmpgt gives the sign mask) and scale by 64 */
17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_26 = m_temp_reg_24;
17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_27 = m_temp_reg_25;
17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Interleave m_temp_reg_71 with zeros (m_temp_reg_20) to form the madd operands used in the eo block */
17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_scratch = temp_array;
17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = 8;
17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][0-3] stored in pu1_dst[0] */
17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][0-3] stored in pu1_dst[1] */
17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][4-7] stored in pu1_dst[2] */
17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][4-7] stored in pu1_dst[3] */
17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][0-3] stored in pu1_dst[4] */
18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][0-3] stored in pu1_dst[5] */
18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][4-7] stored in pu1_dst[6]*/
18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][4-7] stored in pu1_dst[7] */
18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][0-3] stored in pu1_dst[8]*/
18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][0-3] stored in pu1_dst[9] */
18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][4-7] stored in pu1_dst[10]*/
18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][4-7] stored in pu1_dst[11] */
18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][0-3] stored in pu1_dst[12]*/
18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][0-3] stored in pu1_dst[13] */
18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][4-7] stored in pu1_dst[14]*/
18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][4-7] stored in pu1_dst[15] */
18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(zero_last8_rows_stg2)
19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo */
19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride);
19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride * 8);
19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                if(!i)
19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp += (stride * 6 + 8);
19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                else
19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp += (stride * 2 + 8);
19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride * 8);
19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride);
19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_76 = _mm_setzero_si128();
19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee */
19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff and src for use in next block */
19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_26 = m_temp_reg_24;
19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_27 = m_temp_reg_25;
19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_scratch = temp_array;
19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = 8;
19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][0-3] stored in pu1_dst[0] */
19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][0-3] stored in pu1_dst[1] */
19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][4-7] stored in pu1_dst[2] */
20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][4-7] stored in pu1_dst[3] */
20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][0-3] stored in pu1_dst[4] */
20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][0-3] stored in pu1_dst[5] */
20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][4-7] stored in pu1_dst[6]*/
20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][4-7] stored in pu1_dst[7] */
20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][0-3] stored in pu1_dst[8]*/
20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][0-3] stored in pu1_dst[9] */
20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][4-7] stored in pu1_dst[10]*/
20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][4-7] stored in pu1_dst[11] */
20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][0-3] stored in pu1_dst[12]*/
21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][0-3] stored in pu1_dst[13] */
21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][4-7] stored in pu1_dst[14]*/
21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][4-7] stored in pu1_dst[15] */
21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo */
21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride);
21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride * 7);
21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride);
21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                if(!i)
21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp += (stride * 6 + 8);
21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                else
21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp += (stride * 2 + 8);
21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride);
21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride * 7);
21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride);
21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee */
21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff and src for use in next block */
21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo */
22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_scratch = temp_array;
22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = 8;
22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[0-3] */
22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo0[4-7] */
22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[0-3] */
22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo1[4-7] */
22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[0-3] */
23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo2[4-7] */
23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[0-3] */
23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* eo3[4-7] */
23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_scratch += out_stride;
23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(zero_last12_rows_stg2)
23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o & stage 2 pre-transposed out */
23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 j;
24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_src_scratch = temp_array;
24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = (trans_size);
24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 in_stride = (8) * 4;
24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride * 9);
24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                if(0 == i)
24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp -= (stride * 2 - 8);
24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                else
24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp -= (stride * 6 - 8);
24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride * 9);
24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j)
24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    else
24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += ((!i) * out_stride + 8);
24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += (i * out_stride + 8);
25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += ((!i) * out_stride + 8);
25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += (i * out_stride + 8);
26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(zero_last8_rows_stg2)
26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o & stage 2 pre-transposed out */
26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 j;
26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_src_scratch = temp_array;
26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = (trans_size);
26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 in_stride = (8) * 4;
26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride);
26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride * 8);
26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                if(0 == i)
26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp -= (stride * 2 - 8);
26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                else
26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp -= (stride * 6 - 8);
26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride * 8);
26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride);
26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j)
26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    else
26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += ((!i) * out_stride + 8);
27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
27440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
27460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
27470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
27490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
27500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
27510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
27610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
27640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
27660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
27670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
27700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
27720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
27730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
27750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
27760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
27770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
27790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
27800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
27810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
27820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
27840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
27860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += (i * out_stride + 8);
27870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
27880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
27900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
27910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
27920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
27930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
27950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
27960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
27980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
27990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
28010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
28020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
28030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
28060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
28070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
28080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
28090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
28110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
28130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
28140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
28170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
28180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
28190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
28200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
28220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
28230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
28250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
28260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
28280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
28290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
28300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
28330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
28340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
28350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
28360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
28380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
28400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += ((!i) * out_stride + 8);
28410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
28440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
28450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
28460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
28470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
28490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
28500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
28520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
28530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
28550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
28560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
28570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
28590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
28600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
28610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
28620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
28640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
28660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
28670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
28700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
28710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
28720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
28730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
28750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
28760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
28780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
28790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
28800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
28820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
28830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
28840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
28850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
28870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
28890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += (i * out_stride + 8);
28900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
28910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
28920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
28930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
28940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
28950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
28960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o & stage 2 pre-transposed out */
28970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
28980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 j;
28990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_src_scratch = temp_array;
29000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
29010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 out_stride = (trans_size);
29020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 in_stride = (8) * 4;
29030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
29050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
29080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride);
29090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
29100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride * 7);
29110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
29120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp += (stride);
29130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
29140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                if(0 == i)
29150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
29160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp -= (stride * 2 - 8);
29170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
29180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                else
29190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
29200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src_temp -= (stride * 6 - 8);
29210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
29220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
29230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride);
29240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
29250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride * 7);
29260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
29270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_temp -= (stride);
29280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
29290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(j = 0; j < 2; j++)
29320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
29330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
29350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
29360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
29370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
29380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
29390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
29400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
29410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    else
29420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
29430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
29440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
29450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
29460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
29470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
29480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
29490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
29500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
29510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
29520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o0[0-3] */
29550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
29560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
29570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
29580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
29590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
29600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
29630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
29640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
29660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
29670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
29680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
29690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
29710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
29720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
29730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
29740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
29750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
29770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_count = _mm_cvtsi32_si128(i4_shift);
29780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
29790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
29810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
29820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
29830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
29840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
29860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
29880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
29890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
29900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1[0-3] */
29920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
29930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
29940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
29950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
29960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
29970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
30000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
30010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
30030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
30040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
30050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
30060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
30080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
30090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
30100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
30110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
30120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
30140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
30150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
30160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
30170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
30190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
30210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += ((!i) * out_stride + 8);
30220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
30230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2[0-3] */
30250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
30260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
30270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
30280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
30290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
30300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
30320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += in_stride;
30330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
30350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
30360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
30370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
30380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
30400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
30410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
30420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
30430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
30440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
30460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
30470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
30480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
30490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
30510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
30530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
30540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
30550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3[0-3] */
30570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
30580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
30590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
30600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
30610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
30620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
30650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
30660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
30680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
30690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
30700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
30710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
30730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
30740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
30750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
30760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
30770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
30790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
30800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
30810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
30820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
30840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
30860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += (i * out_stride + 8);
30870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
30880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o4[0-3] */
30900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
30910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
30920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
30930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
30940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
30950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
30970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
30980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
31000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
31010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
31020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
31030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
31050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
31060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
31070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
31080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
31090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
31110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
31120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
31130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
31140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
31160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
31180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
31190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
31200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o5[0-3] */
31220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
31230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
31240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
31250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
31260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
31270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
31300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
31310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
31330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
31340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
31350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
31360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
31380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
31390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
31400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
31410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
31420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
31440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
31450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
31460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
31470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
31490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
31510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += ((!i) * out_stride + 8);
31520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
31530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o6[0-3] */
31550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
31560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
31570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
31580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
31590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
31600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
31630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch -= in_stride;
31640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
31660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
31670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
31680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
31690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
31720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
31730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
31740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
31750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
31760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
31780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
31790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
31800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
31810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
31830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
31850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += out_stride;
31860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
31870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o7[0-3] */
31890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
31900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
31910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
31920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
31930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
31940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
31960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src_scratch += 8;
31970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
31990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
32000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
32010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
32020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
32030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
32060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
32070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
32080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
32090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
32110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
32130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_dst_scratch += (i * out_stride + 8);
32140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    }
32150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
32170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
32180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
32190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
32200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Transpose */
32220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
32230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD16 *pi2_src_scratch;
32240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        UWORD8 *pu1_pred_temp = pu1_pred;
32250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 out_stride = dst_strd;
32260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 in_stride = trans_size;
32270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 j;
32280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_1 = _mm_setzero_si128();
32290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(i = 0; i < 2; i++)
32300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
32310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
32320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(j = 0; j < 2; j++)
32340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
32350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
32360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
32370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
32380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += ((!i) * in_stride + 8);
32390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
32400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += (in_stride);
32410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
32420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += (i * in_stride + 8);
32430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
32440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
32450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
32460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += ((!i) * in_stride + 8);
32470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
32480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += in_stride;
32490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
32500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src_scratch += (i * in_stride + 8);
32510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
32530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
32540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
32560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
32570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
32590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
32600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
32620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
32630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
32660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
32670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
32690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
32700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
32720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
32730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
32750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
32760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
32790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
32800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
32820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
32830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
32850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
32860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
32870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
32890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
32900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += out_stride;
32910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred_temp += pred_strd;
32920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
32940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
32950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
32970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
32980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
33000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
33010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
33020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
33040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
33050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += out_stride;
33060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred_temp += pred_strd;
33070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
33090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
33100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
33120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
33130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
33150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
33160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
33170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
33190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
33200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += out_stride;
33210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred_temp += pred_strd;
33220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
33240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
33250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
33270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
33280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
33300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
33310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
33320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
33340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
33350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += out_stride;
33360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred_temp += pred_strd;
33370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
33380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
33390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
33400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
3341