10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* ihevc_chroma_intra_pred_filters_x86_intr.c 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Contains function Definition for intra prediction interpolation filters 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Ittiam 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions: 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* ihevc_intra_pred_chroma_planar_sse42() 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* ihevc_intra_pred_chroma_dc_sse42() 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/ 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* File Includes */ 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/ 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h" 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h" 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h" 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h" 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_intra_pred.h" 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_chroma_intra_pred.h" 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_common_tables.h" 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_tables_x86_intr.h" 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <mmintrin.h> 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <xmmintrin.h> 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h> 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <smmintrin.h> 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h> 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64
#define T16C_4NT 64
#define T8C_4NT 32
/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/

/* Evaluates to 1 when bit 'x' of 'y' is set and to 0 otherwise.            */
/* Both arguments and the whole expansion are parenthesized so the macro    */
/* can be compared, negated or given an expression as 'x' without the       */
/* surrounding operators regrouping it.                                     */
#define GET_BIT(y,x) (((y) & (1 << (x))) != 0)

/* tables to shuffle 8-bit values */

/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/

830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Planar Intraprediction with reference neighboring samples location 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* to section 8.4.4.2.4 in the standard 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* UWORD8 pointer to the source 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_dst 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* UWORD8 pointer to the destination 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer source stride 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer destination stride 
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer Transform Block size 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer intraprediction mode 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref, 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 nt, 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 mode) 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col; 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 log2nt = 5; 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 
two_nt, three_nt; 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b; 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(src_strd); 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(mode); 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar switch(nt) 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 16: 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 4; 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 8: 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 3; 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 4: 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 2; 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar default: 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar two_nt = 2 * nt; 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar three_nt = 3 * nt; 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Planar filtering */ 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* setting vallues in registera*/ 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// pu1_ref[2*(two_nt - 1 - row)] 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// pu1_ref[2 * (three_nt + 1)] 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// pu1_ref[2 * (two_nt + 1) + col] 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// pu1_ref[2 * (nt - 1)] 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp_4x32b = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]); 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]); 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp4_4x32b = _mm_set1_epi16(nt - 1); 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp6_4x32b = _mm_set1_epi16(nt); 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp7_4x32b = _mm_set1_epi16(4); 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar zero_8x16b = 
_mm_set1_epi32(0); 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(nt % 4 == 0) 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp7_4x32b = _mm_set1_epi16(4); 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < nt; row++) 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b; 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp3_8x16b; 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp2_4x32b = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]); 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp3_4x32b = _mm_set1_epi16((row + 1)); 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar row_8x16b = _mm_set1_epi16((nt - 1 - row)); 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0); 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1); 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b); 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*(row + 1) * pu1_ref[nt - 1]*/ 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b); 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*(row + 1) * pu1_ref[nt - 1] + nt)*/ 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < 2 * nt; col += 8) 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp_8x16b; 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* loding 8bit 16 pixles*/ 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col)); 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp_8x16b = _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/ 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */ 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b); 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*(col + 1) * pu1_ref[three_nt + 1]*/ 
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b); 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/ 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b); 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b); 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b); 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1)); 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b); 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b); 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b); 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b); 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here */ 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 
/**
*******************************************************************************
*
* @brief
*  Intraprediction for DC mode with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
*  to section 8.4.4.2.5 in the standard
*
* @par Description:
*
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer Transform Block size (Chroma) 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer intraprediction mode 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref, 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 nt, 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 mode) 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 acc_dc_u, acc_dc_v; 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dc_val_u, dc_val_v; 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row; 
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 log2nt = 5; 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask; 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp7, src_temp8, src_temp9, src_temp10; 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i m_zero = _mm_set1_epi32(0); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(src_strd); 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(mode); 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar switch(nt) 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 32: 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 5; 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 16: 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 4; 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 8: 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 3; 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar case 4: 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar log2nt = 2; 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar default: 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar break; 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_u = 0; 
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_v = 0; 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Calculate DC value for the transform block */ 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]); 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(nt == 16) 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_sad; 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt))); 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16)); 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32)); 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48)); 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5 = _mm_cvtepu8_epi16(src_temp3); 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6 = _mm_cvtepu8_epi16(src_temp4); 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9 = _mm_cvtepu8_epi16(src_temp7); 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp10 = _mm_cvtepu8_epi16(src_temp8); 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_srli_si128(src_temp3, 8); 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_temp4 = _mm_srli_si128(src_temp4, 8); 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7 = _mm_srli_si128(src_temp7, 8); 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8 = _mm_srli_si128(src_temp8, 8); 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_cvtepu8_epi16(src_temp3); 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_cvtepu8_epi16(src_temp4); 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7 = _mm_cvtepu8_epi16(src_temp7); 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8 = _mm_cvtepu8_epi16(src_temp8); 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_add_epi16(src_temp4, src_temp6); 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6 = _mm_add_epi16(src_temp3, src_temp5); 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8 = _mm_add_epi16(src_temp7, src_temp8); 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp10 = _mm_add_epi16(src_temp9, src_temp10); 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_add_epi16(src_temp4, src_temp6); 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8 = _mm_add_epi16(src_temp8, src_temp10); 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_add_epi16(src_temp4, src_temp8); 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask); 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = 
_mm_hadd_epi16(src_temp4, m_zero); 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_cvtepi16_epi32(src_temp4); 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */ 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_u = _mm_cvtsi128_si32(src_temp4); 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_v = _mm_cvtsi128_si32(temp_sad); 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(nt == 8) 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_sad; 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt))); 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16)); 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5 = _mm_cvtepu8_epi16(src_temp3); 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6 = _mm_cvtepu8_epi16(src_temp4); 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_srli_si128(src_temp3, 8); 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_srli_si128(src_temp4, 8); 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_cvtepu8_epi16(src_temp3); 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_cvtepu8_epi16(src_temp4); 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_add_epi16(src_temp4, src_temp6); 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6 = _mm_add_epi16(src_temp3, src_temp5); 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_add_epi16(src_temp4, src_temp6); 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask); 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_cvtepi16_epi32(src_temp4); 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */ 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_u = _mm_cvtsi128_si32(src_temp4); 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_v = _mm_cvtsi128_si32(temp_sad); 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(nt == 4) 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_sad; 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt))); 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5 = _mm_cvtepu8_epi16(src_temp3); 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_srli_si128(src_temp3, 8); 
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_cvtepu8_epi16(src_temp4); 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_add_epi16(src_temp4, src_temp5); 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask); 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4 = _mm_cvtepi16_epi32(src_temp4); 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */ 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_u = _mm_cvtsi128_si32(src_temp4); 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_v = _mm_cvtsi128_si32(temp_sad); 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_u += pu1_ref[6 * nt]; 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_v += pu1_ref[6 * nt + 1]; 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_u -= pu1_ref[4 * nt]; 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar acc_dc_v -= pu1_ref[4 * nt + 1]; 4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_val_u = (acc_dc_u + nt) >> (log2nt + 1); 
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_val_v = (acc_dc_v + nt) >> (log2nt + 1); 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dc_val_u = dc_val_u | (dc_val_v << 8); 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Fill the remaining rows with DC value*/ 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(nt == 4) 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1 = _mm_set1_epi16(dc_val_u); 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1); 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1); 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1); 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(nt == 8) 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1 = _mm_set1_epi16(dc_val_u); 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1); 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1); 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1); 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1); 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1); 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1); 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1); 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* nt == 16 */ 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1 = _mm_set1_epi16(dc_val_u); 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < nt; row += 8) 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1); 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1); 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1); 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1); 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1); 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1); 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1); 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1); 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1); 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1); 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1); 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1); 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1); 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1); 
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1); 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8 * dst_strd; 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 487