10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  ihevc_itrans_recon_atom_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Contains function definitions for inverse  quantization, inverse
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * transform and reconstruction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  100470
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  100592 (edited by)
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par List of Functions:
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  - ihevc_itrans_recon_4x4_ttype1_ssse3()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  - ihevc_itrans_recon_4x4_ssse3()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  - ihevc_itrans_recon_8x8_ssse3()
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  None
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h>
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <string.h>
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_tables.h"
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_iquant_itrans_recon.h"
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_trans_macros.h"
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h>
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h>
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse transform type 1 (DST) and reconstruction
 * for a 4x4 input block
 *
 * @par Description:
 *  Performs inverse transform type 1 (DST), adds the prediction data and
 * clips the output to 8 bit. No inverse quantization is done here: the
 * coefficients in pi2_src are fed to the transform as they are.
 *
 * @param[in] pi2_src
 *  Input 4x4 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 4x4 buffer for storing inverse transform 1st stage output
 *  (unused in this implementation)
 *
 * @param[in] pu1_pred
 *  Prediction 4x4 block
 *
 * @param[out] pu1_dst
 *  Output 4x4 block
 *
 * @param[in] src_strd
 *  Input stride (in WORD16 elements)
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Zero columns in pi2_src (unused in this implementation)
 *
 * @param[in] zero_rows
 *  Presumably zero rows in pi2_src, by analogy with zero_cols
 *  (unused in this implementation)
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_4x4_ttype1_ssse3(WORD16 *pi2_src,
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD16 *pi2_tmp,
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         UWORD8 *pu1_pred,
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         UWORD8 *pu1_dst,
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 src_strd,
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 pred_strd,
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 dst_strd,
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 zero_cols,
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 zero_rows)
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_2;
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_3;
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_4;
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20;
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_21;
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_22;
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_23;
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_24;
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_25;
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_30;
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_31;
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_32;
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_33;
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_34;
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_35;
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_36;
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_rdng_factor;
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_count;
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row0;
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row1;
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row2;
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row3;
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_zero = _mm_setzero_si128();
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i4_shift = IT_SHIFT_STAGE_1;
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(zero_cols);
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(zero_rows);
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(pi2_tmp);
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[4] in m_temp_reg_14 */
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[4] = src[0] - src[2] + src[3] */
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[3] in m_temp_reg_13 */
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 3);
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_13 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[0] in m_temp_reg_10 */
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[1] in m_temp_reg_11 */
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[2] in m_temp_reg_12 */
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[4] in m_temp_reg_14 */
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* c[4] = src[0] - src[2] + src[3] */
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Stage 1 outputs stored in m_temp_reg_20-23 */
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 5);
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 1);
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1);//29*c0
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 6);
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 3);
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2);//55*c1
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_11, 5);
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_11, 1);
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_11);
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1);//29*c1
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 6);
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 3);
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2);//55*c2
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_10, 6);
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_10, 3);
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_10);
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2);//55*c0
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_12, 5);
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_12, 1);
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_12);
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_21);
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1);//29*c2
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_14, 6);
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_14, 3);
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_14, 1);
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3);//74*c4
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_count = _mm_cvtsi32_si128(i4_shift);
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Stage 2 */
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        i4_shift = IT_SHIFT_STAGE_2;
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* c[4] stored in m_temp_reg_4 */
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* c[3] stored in m_temp_reg_3 */
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_22, 6);
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_22, 3);
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_22, 1);
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* c[0] stored in m_temp_reg_0 */
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* c[1] stored in m_temp_reg_1 */
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* c[2] stored in m_temp_reg_2 */
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* c[4] stored in m_temp_reg_4 */
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Stage 2 output generation */
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 5);
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 1);
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);//29*c0
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 6);
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 3);
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2);//55*c1
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_1, 5);
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_1, 1);
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_1);
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//29*c1
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 6);
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 3);
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);//55*c2
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_0, 3);
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_0);
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);//55*c0
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_2, 5);
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 1);
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_2);
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_11);
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);//29*c2
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_4, 6);
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_4, 3);
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_12 = _mm_slli_epi32(m_temp_reg_4, 1);
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_36 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_13);
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3);//74*c4
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_count = _mm_cvtsi32_si128(i4_shift);
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Recon and store */
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 *pi4_dst = (WORD32 *)pu1_dst;
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);*/
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += dst_strd;
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi4_dst = (WORD32 *)(pu1_dst);
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += dst_strd;
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi4_dst = (WORD32 *)(pu1_dst);
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += dst_strd;
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi4_dst = (WORD32 *)(pu1_dst);
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
 * @brief
 *  This function performs inverse transform (DCT) and reconstruction
 * for a 4x4 input block
 *
 * @par Description:
 *  Performs the inverse transform, adds the prediction data and clips
 * the output to 8 bit
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Input 4x4 coefficients
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
 * @param[in] pi2_tmp
 *  Temporary 4x4 buffer for storing inverse
 *  transform 1st stage output (marked UNUSED in this implementation)
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Prediction 4x4 block
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
 * @param[out] pu1_dst
 *  Output 4x4 block
 *
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Input stride
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Prediction stride
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Output Stride
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
 * @param[in] zero_cols
 *  Zero columns in pi2_src (marked UNUSED in this implementation)
 *
 * @param[in] zero_rows
 *  Zero rows in pi2_src (marked UNUSED in this implementation)
 *
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns  Void
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  None
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD16 *pi2_tmp,
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  UWORD8 *pu1_pred,
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  UWORD8 *pu1_dst,
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 src_strd,
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 pred_strd,
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 dst_strd,
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 zero_cols,
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 zero_rows)
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_2;
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_3;
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_4;
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_15;
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20;
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_21;
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_22;
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_23;
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_24;
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_25;
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_30;
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_31;
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_33;
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_34;
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_rdng_factor;
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_count;
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row0;
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row1;
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row2;
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_ge_zero16b_flag_row3;
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_zero = _mm_setzero_si128();
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i4_shift = IT_SHIFT_STAGE_1;
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(zero_rows);
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(zero_cols);
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(pi2_tmp);
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_0);
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_1);
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row2 = _mm_cmpgt_epi16(m_zero, m_temp_reg_2);
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_ge_zero16b_flag_row3 = _mm_cmpgt_epi16(m_zero, m_temp_reg_3);
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_0, m_ge_zero16b_flag_row0);
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_1, m_ge_zero16b_flag_row1);
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_2 = _mm_unpacklo_epi16(m_temp_reg_2, m_ge_zero16b_flag_row2);
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_3 = _mm_unpacklo_epi16(m_temp_reg_3, m_ge_zero16b_flag_row3);
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /*m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);*/
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* e */
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* o */
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 5);
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 2);
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_12 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 6);
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 4);
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_3, 1);
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_3);
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_13 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_1, 6);
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_1, 4);
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_slli_epi32(m_temp_reg_1, 1);
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_1);
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_22);
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_14 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_24);
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_slli_epi32(m_temp_reg_3, 5);
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_slli_epi32(m_temp_reg_3, 2);
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_15 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* e1 stored in m_temp_reg_31 */
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* e0 stored in m_temp_reg_30 */
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_count = _mm_cvtsi32_si128(i4_shift);
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* o1 stored in m_temp_reg_33 */
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* e1 + add */
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* e0 + add */
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* o0 stored in m_temp_reg_34 */
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Stage 1 outputs */
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Stage 2 */
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        i4_shift = IT_SHIFT_STAGE_2;
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);*/
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_ge_zero16b_flag_row0 = _mm_cmpgt_epi16(m_zero, m_temp_reg_30);
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_ge_zero16b_flag_row1 = _mm_cmpgt_epi16(m_zero, m_temp_reg_31);
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_20 = _mm_unpacklo_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_unpacklo_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_unpackhi_epi16(m_temp_reg_30, m_ge_zero16b_flag_row0);
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_unpackhi_epi16(m_temp_reg_31, m_ge_zero16b_flag_row1);
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);*/
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* e */
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* o */
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*{
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1);//src[1]*36
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);//src[1]*83
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3);//src[3]*83
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1);//src[3]*36
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }*/
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 5);
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 2);
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_12 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1);//src[1]*36
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 6);
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 4);
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_23, 1);
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_23);
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_13 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3);//src[3]*83
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_22, 6);
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_22, 4);
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_slli_epi32(m_temp_reg_22, 1);
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_22);
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_1, m_temp_reg_2);
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_14 = _mm_add_epi32(m_temp_reg_3, m_temp_reg_4);
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);//src[1]*83
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_slli_epi32(m_temp_reg_23, 5);
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_slli_epi32(m_temp_reg_23, 2);
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_15 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_1);
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1);//src[3]*36
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* e */
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* e1 stored in m_temp_reg_31 */
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* e0 stored in m_temp_reg_30 */
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_count = _mm_cvtsi32_si128(i4_shift);
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* o1 stored in m_temp_reg_33 */
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* e1 + add */
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* e0 + add */
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* o0 stored in m_temp_reg_34 */
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Stage 2 outputs */
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Recon and store */
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += pred_strd;
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_zero);
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_zero);
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_zero);
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_zero);
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += dst_strd;
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu4_dst = (UWORD32 *)(pu1_dst);
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += dst_strd;
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu4_dst = (UWORD32 *)(pu1_dst);
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += dst_strd;
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu4_dst = (UWORD32 *)(pu1_dst);
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @brief
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  This function performs inverse quantization, inverse transform and
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * reconstruction for 8x8 input block
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @par Description:
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Performs inverse quantization, inverse transform and adds the
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * prediction data and clips output to 8 bit
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_src
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Input 8x8 coefficients
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_tmp
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Temporary 8x8 buffer for storing inverse
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  transform 1st stage output
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pu1_pred
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Prediction 8x8 block
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pi2_dequant_coeff
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Dequant Coeffs
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[out] pu1_dst
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Output 8x8 block
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] src_strd
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Input stride
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_div
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Quantization parameter / 6
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] qp_rem
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Quantization parameter % 6
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] pred_strd
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Prediction stride
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] dst_strd
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Output Stride
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @param[in] zero_cols
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  Zero columns in pi2_src
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @returns  Void
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar * @remarks
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *  None
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *******************************************************************************
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar */
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD16 *pi2_tmp,
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  UWORD8 *pu1_pred,
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  UWORD8 *pu1_dst,
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 src_strd,
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 pred_strd,
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 dst_strd,
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 zero_cols,
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 zero_rows)
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_2;
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_3;
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_5;
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_6;
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_7;
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_4;
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_15;
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_16;
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_17;
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20;
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_21;
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_22;
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_23;
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_24;
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_25;
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_26;
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_27;
9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_30;
9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_31;
9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_32;
9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_33;
9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_34;
9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_35;
9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_36;
9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_37;
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_40;
9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_41;
9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_42;
9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_43;
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_44;
9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_45;
9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_46;
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_47;
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_50;
9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_51;
9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_52;
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_53;
10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_54;
10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_55;
10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_56;
10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_57;
10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_60;
10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_61;
10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_62;
10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_63;
10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_64;
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_65;
10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_66;
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_67;
10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_70;
10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_71;
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_72;
10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_73;
10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_74;
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_75;
10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_76;
10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_77;
10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 check_row_stage_1;   /* Lokesh */
10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 check_row_stage_2;   /* Lokesh */
10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_rdng_factor;
10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //__m128i m_count;
10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i4_shift = IT_SHIFT_STAGE_1;
10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(zero_rows);
10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(zero_cols);
10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(pi2_tmp);
10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src);
10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src);
10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src);
10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src);
10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src);
10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src);
10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src);
10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pi2_src += src_strd;
10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src);
10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(!check_row_stage_2)
10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(!check_row_stage_1)
10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 0,4 row in 0 , 1 Rishab
10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 2,6 row in 4, 5 Rishab
10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e */
10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o */
11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o0:1B*89+3B*75,5B*50+7B*18
11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 0 of destination computed here */
11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_50 */
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 7 of destination computed here */
11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_57 */
11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_setzero_si128();
11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o1:1B*75-3B*18,5B*89+7B*50
11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o2  in the next block */
11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 1 of destination computed here */
11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_51 */
11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 6 of destination computed here */
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_56 */
11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o2:1B*50-3B*89,5B*18+7B*75
11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o3  in the next block */
11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 2 of destination computed here */
11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_52 */
11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 5 of destination computed here */
11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_55 */
11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o3:1B*18-3B*50,5B*75-7B*89
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 3 of destination computed here */
12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_53 */
12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 4 of destination computed here */
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_54 */
12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Transpose of the destination 8x8 matrix done here */
12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* respectively */
12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                */
12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_setzero_si128();
12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_setzero_si128();
12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_setzero_si128();
12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_setzero_si128();
12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 0,4 row in 0 , 1 Rishab
12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 2,6 row in 4, 5 Rishab
13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e */
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o */
13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o0:1B*89+3B*75,5B*50+7B*18
13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 0 of destination computed here */
13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_50 */
13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 7 of destination computed here */
13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_57 */
13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_setzero_si128();
13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o1:1B*75-3B*18,5B*89+7B*50
13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o2  in the next block */
13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 1 of destination computed here */
13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_51 */
13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 6 of destination computed here */
13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_56 */
13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o2:1B*50-3B*89,5B*18+7B*75
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o3  in the next block */
14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 2 of destination computed here */
14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_52 */
14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 5 of destination computed here */
14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_55 */
14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o3:1B*18-3B*50,5B*75-7B*89
14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 3 of destination computed here */
14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_53 */
14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 4 of destination computed here */
14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_54 */
14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Transpose of the destination 8x8 matrix done here */
14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* respectively */
14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                */
14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_setzero_si128();
14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_setzero_si128();
15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_setzero_si128();
15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_setzero_si128();
15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Stage 2 */
15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        i4_shift = IT_SHIFT_STAGE_2;
15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff for computing o0 in the next block */
15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e */
15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o */
15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55,m_temp_reg_57);
15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55,m_temp_reg_57);
15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o0:1B*89+3B*75,1T*89+3T*75
15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o1 in the next block */
15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 0 of destination computed here */
15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_50 */
15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 7 of destination computed here */
15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_57 */
15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o1:1B*75-3B*18,1T*75-3T*18
16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o2  in the next block */
16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 1 of destination computed here */
16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_51 */
16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 6 of destination computed here */
16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_56 */
16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o2:1B*50-3B*89,1T*50-3T*89
16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o3  in the next block */
16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 2 of destination computed here */
16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_52 */
16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 5 of destination computed here */
16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_55 */
16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o3:1B*18-3B*50,1T*18-3T*50
16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 3 of destination computed here */
16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_53 */
16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Column 4 of destination computed here */
16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* It is stored in m_temp_reg_54 */
17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Transpose of the destination 8x8 matrix done here */
17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* respectively */
17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Recon and store */
17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_setzero_si128();
17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(!check_row_stage_1)
18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 0,4 row in 0 , 1 Rishab
18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 2,6 row in 4, 5 Rishab
18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* e */
18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o */
18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //  m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75,m_temp_reg_77);
19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //   m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75,m_temp_reg_77);
19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o0:1B*89+3B*75,1T*89+3T*75
19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 0 of destination computed here */
19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_50 */
19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 7 of destination computed here */
19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_57 */
19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o2  in the next block */
19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 1 of destination computed here */
19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_51 */
19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 6 of destination computed here */
19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_56 */
19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o2:1B*50-3B*89,1T*50-3T*89
19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o3  in the next block */
19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 2 of destination computed here */
20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_52 */
20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 5 of destination computed here */
20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_55 */
20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o3:1B*18-3B*50,1T*18-3T*50
20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 3 of destination computed here */
20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_53 */
20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 4 of destination computed here */
20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_54 */
20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Transpose of the destination 8x8 matrix done here */
20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* respectively */
20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 0,4 row in 0 , 1 Rishab
21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //Interleaving 2,6 row in 4, 5 Rishab
21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* e */
21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o */
21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 0 of destination computed here */
21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_50 */
21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 7 of destination computed here */
21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_57 */
21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o2  in the next block */
22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 1 of destination computed here */
22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_51 */
22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 6 of destination computed here */
22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_56 */
22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o3  in the next block */
22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 2 of destination computed here */
22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_52 */
22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 5 of destination computed here */
22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_55 */
22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 3 of destination computed here */
23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_53 */
23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 4 of destination computed here */
23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_54 */
23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Transpose of the destination 8x8 matrix done here */
23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* respectively */
23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Stage 2 */
23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        i4_shift = IT_SHIFT_STAGE_2;
23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_66 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1);
24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_64 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2);
24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_62 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2);
24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_60 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1);
24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_16 = _mm_sub_epi32(m_temp_reg_64, m_temp_reg_66);
24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                //m_temp_reg_14 = _mm_add_epi32(m_temp_reg_60, m_temp_reg_62);
24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Loading coeff for computing o0 in the next block */
24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*m_temp_reg_3 = _mm_srli_si128(m_temp_reg_53, 8);
24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                */
24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* e */
24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* o */
24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*  m_temp_reg_4 = _mm_cvtepi16_epi32(m_temp_reg_55);
24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_5 = _mm_srli_si128(m_temp_reg_55, 8);
24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_cvtepi16_epi32(m_temp_reg_57);
24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srli_si128(m_temp_reg_57, 8);
24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_5 = _mm_cvtepi16_epi32(m_temp_reg_5);
24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_cvtepi16_epi32(m_temp_reg_7);
24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    */
24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o1 in the next block */
24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 0 of destination computed here */
24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_50 */
24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 7 of destination computed here */
24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_57 */
24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o2  in the next block */
25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 1 of destination computed here */
25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_51 */
25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 6 of destination computed here */
25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_56 */
25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Loading coeff for computing o3  in the next block */
25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 2 of destination computed here */
25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_52 */
25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 5 of destination computed here */
25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_55 */
25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 3 of destination computed here */
26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_53 */
26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Column 4 of destination computed here */
26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* It is stored in m_temp_reg_54 */
26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Transpose of the destination 8x8 matrix done here */
26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* and the result is stored in registers m_temp_reg_10 to m_temp_reg_17 */
26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* respectively */
26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Recon and store */
26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += pred_strd;
26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_setzero_si128();
26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += dst_strd;
27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2743