/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevcd_it_rec_dc_x86_intr.c
*
* @brief
*  Platform specific intrinsic implementation of certain functions
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ihevcd_itrans_recon_dc_luma_sse42
*  - ihevcd_itrans_recon_dc_chroma_sse42
*  - ihevcd_fmt_conv_420sp_to_420p
*
* @remarks
*  None
*
*******************************************************************************
*/

#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h" 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h" 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevcd_function_selector.h" 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h> 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd, 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 log2_trans_size, WORD16 i2_coeff_value) 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_2; 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_3; 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_4; 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_5; 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_6; 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_7; 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_8; 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_9; 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_10; 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_15; 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20, zero_8x16b; 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i *pi4_dst = (__m128i *)pu1_dst; 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //WORD32 row,col; 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 add, shift; 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dc_value, quant_out; 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 trans_size; 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar trans_size = (1 << log2_trans_size); 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar quant_out = i2_coeff_value; 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar shift = IT_SHIFT_STAGE_1; 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add = 1 << (shift - 1); 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_value = CLIP_S16((quant_out * 64 + add) >> shift); 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar shift = IT_SHIFT_STAGE_2; 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add = 1 << (shift - 1); 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_value = 
CLIP_S16((dc_value * 64 + add) >> shift); 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Replicate the DC value within 16 bits in 128 bit register*/ 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_set1_epi16(dc_value); 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_8x16b = _mm_setzero_si128(); 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(trans_size == 4) 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 *pi4_dst = (WORD32 *)pu1_dst; 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred)); 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1); 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3); 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b); 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b); 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7); 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8); 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4); 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8); 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12); 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (WORD32 *)(pu1_dst); 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (WORD32 *)(pu1_dst); 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (WORD32 *)(pu1_dst); 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *pi4_dst = 
_mm_cvtsi128_si32(m_temp_reg_3); 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i, j; 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = 1; i <= trans_size; i += 4) 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 1; j <= trans_size; j += 8) 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b); 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b); 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b); 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b); 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20); 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20); 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst); 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9); 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_12); 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + dst_strd); 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8); 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_13); 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd); 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11); 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_14); 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd); 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8); 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_15); 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += 8; 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += 4 * pred_strd - trans_size; 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4 * dst_strd - trans_size; 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd, 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 log2_trans_size, WORD16 i2_coeff_value) 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_0; 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_1; 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_2; 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_3; 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_4; 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_5; 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
__m128i m_temp_reg_6; 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_7; 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_8; 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_9; 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_10; 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_11; 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_12; 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_13; 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_14; 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_15; 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_temp_reg_20, zero_8x16b; 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i *pi4_dst = (__m128i *)pu1_dst; 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //WORD32 row,col; 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 add, shift; 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dc_value, quant_out; 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 trans_size; 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shuffle_mask_4x4 = 0x06040200; 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 unchanged_mask_4x4 = 0x07050301; 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 shuffle_mask = 0x0E0C0A0806040200LL; 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 unchanged_mask = 0x0F0D0B0907050301LL; 
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar trans_size = (1 << log2_trans_size); 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar quant_out = i2_coeff_value; 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar shift = IT_SHIFT_STAGE_1; 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add = 1 << (shift - 1); 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_value = CLIP_S16((quant_out * 64 + add) >> shift); 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar shift = IT_SHIFT_STAGE_2; 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar add = 1 << (shift - 1); 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_value = CLIP_S16((dc_value * 64 + add) >> shift); 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Replicate the DC value within 16 bits in 128 bit register*/ 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_20 = _mm_set1_epi16(dc_value); 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_8x16b = _mm_setzero_si128(); 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(trans_size == 4) 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i chroma_shuffle_mask_16x8b; 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i chroma_unchanged_mask_16x8b; 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4); 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar chroma_unchanged_mask_16x8b = 
_mm_cvtsi32_si128(unchanged_mask_4x4); 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Load the prediction data*/ 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred)); 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)); 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)); 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)); 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b); 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b); 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b); 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b); 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b); 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b); 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/ 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst); 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd)); 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd)); 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b); 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b); 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b); 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b); 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7); 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0); 
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1); 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2); 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4); 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3); 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Store the result in the destination*/ 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_9); 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst); 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_10); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst); 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_11); 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += dst_strd; 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst); 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_12); 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 i, j; 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i chroma_shuffle_mask_16x8b; 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i chroma_unchanged_mask_16x8b; 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask)); 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar chroma_unchanged_mask_16x8b = 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_loadl_epi64((__m128i *)(&unchanged_mask)); 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(i = 0; i < trans_size; i += 4) 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(j = 0; j < trans_size; j += 8) 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred); 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd)); 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd)); 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd)); 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Retain only 
one chroma component*/ 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b); 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b); 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b); 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b); 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b); 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b); 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b); 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b); 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20); 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20); 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20); 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20); 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/ 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst); 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd)); 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd)); 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd)); 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_0 = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b); 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_1 = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b); 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_2 = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b); 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_3 = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b); 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9); 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11); 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0); 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8); 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1); 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2); 
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8); 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3); 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*Store the result in the destination*/ 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst); 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_12); 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8); 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 8); 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_8); 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + dst_strd); 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_13); 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8); 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8); 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_9); 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd); 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_14); 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8); 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8); 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_10); 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd); 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_15); 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8); 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8); 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64(pi4_dst, m_temp_reg_11); 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += 16; 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 16; 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_pred += 4 * pred_strd - 2 * trans_size; 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4 * dst_strd - 2 * trans_size; 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 402