10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* ihevc_deblck_atom_intr.c 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Contains function definitions for deblocking filters 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Rishab 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions: 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* - ihevc_deblk_luma_vert_ssse3() 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* - ihevc_deblk_luma_horz_ssse3() 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* - ihevc_deblk_chroma_vert_ssse3() 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* - ihevc_deblk_chroma_horz_ssse3() 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar*/ 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdlib.h> 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h> 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <assert.h> 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h" 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h" 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h" 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_deblk.h" 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_deblk_tables.h" 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_debug.h" 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_tables_x86_intr.h" 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h> 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Decision process and filtering for the luma block vertical edge. 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* The decision process for the luma block vertical edge is carried out and 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* an appropriate filter is applied. 
The boundary filter strength, bs should 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* be greater than 0. The pcm flags and the transquant bypass flags should 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* be taken care of by the calling function. 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to the src sample q(0,0) 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Source stride 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] bs 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Boundary filter strength of q(0,0) 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] quant_param_p 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* quantization parameter of p block 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] quant_param_q 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* quantization parameter of q block 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] beta_offset_div2 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] tc_offset_div2 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar* @param[in] filter_flag_p 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* flag whether to filter the p block 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] filter_flag_q 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* flag whether to filter the q block 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src, 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 bs, 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_p, 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_q, 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 beta_offset_div2, 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_offset_div2, 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 filter_flag_p, 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 filter_flag_q) 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_luma, beta_indx, tc_indx; 
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 beta, tc; 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 d, dp, dq, d_sam0, d_sam3; 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 d3, d0, de_0, de_1, de_2, de_3; 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 de, dep, deq; 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b; 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b; 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b; 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT((bs > 0) && (bs <= 3)); 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(filter_flag_p || filter_flag_q); 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_luma = (quant_param_p + quant_param_q + 1) >> 1; 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51); 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* BS based on implementation can take value 3 if it is intra/inter egde */ 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* based on BS, tc index is calcuated by adding 2 * ( bs 
- 1) to QP and tc_offset */ 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */ 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* the above desired functionallity is achieved by doing (2*(bs>>1)) */ 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53); 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beta = gai4_ihevc_beta_table[beta_indx]; 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc = gai4_ihevc_tc_table[tc_indx]; 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == tc) 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar return; 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4)); 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd)); 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coef_8x16b = _mm_load_si128((__m128i *)(coef_d)); 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d)); 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b); 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b); 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b); 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get all 1's of 8 bit in (1) 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b); 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15); 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //accumulating values foe dp3 dq3 , dp0 dq0 values 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b); 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b); 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // to get all 1,-1 sets of 16 bits in (0) 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b); 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b); 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get 16 bit 1's 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8); 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // dq3 dp3 dq0 dp0 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b); 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec); 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49); 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // dq dp d3 d0 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b); 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00| 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b); 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00| 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b); 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ///store back in a single variable 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4); 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8); 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12); 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d0 = _mm_cvtsi128_si32(mask_d_result_4x32b); 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d3 = _mm_cvtsi128_si32(temp_coef0_8x16b); 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dp = _mm_cvtsi128_si32(temp_coef1_8x16b); 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar dq = _mm_cvtsi128_si32(mask_16x8b); 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting d 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d = d0 + d3; 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ///store back in a single variable 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4); 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8); 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12); 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b); 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b); 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b); 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_3 = _mm_cvtsi128_si32(mask_16x8b); 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de = 0; 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dep = 0; 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar deq = 0; 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(d < beta) 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam0 = 0; 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if((2 * d0 < (beta >> 2)) 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && (de_2 < (beta >> 3)) 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && (de_0 < ((5 * tc + 1) >> 1))) 
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam0 = 1; 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam3 = 0; 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if((2 * d3 < (beta >> 2)) 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && (de_3 < (beta >> 3)) 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && de_1 < ((5 * tc + 1) >> 1)) 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam3 = 1; 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de = (d_sam0 & d_sam3) + 1; 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0; 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar deq = (dq < (beta + (beta >> 1)) >> 3) ? 
1 : 0; 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(tc <= 1) 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dep = 0; 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar deq = 0; 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(de != 0) 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd)); 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd)); 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(de == 2) 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq_str0_16x8b; 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b; 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq2_str0_16x8b; 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq_str1_16x8b; 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b; 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b; 
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const2_8x16b, const2tc_8x16b; 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 mask, tc2; 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc = tc << 1; 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31); 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc2 = ((LWORD64)tc); 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b); 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0-q'1-2 ,p'0-p'1-2 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b); 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b); 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(const2_8x16b, 15); 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16); 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16); 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranged x x x x x x x x q31 q30 q1 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b); 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b); 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b); 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b); 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b); 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1-2, p'1-2 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8); 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8); 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b); 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b); 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58); 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58); 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b); 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01 
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b); 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b); 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping mask design 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str1_16x8b = _mm_setzero_si128(); 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask)); 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2)); 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44); 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b); 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping mask design 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31); 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b); 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calculating Clipping MAX for all pixel values. 
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b); 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b); 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'2-q'0-2,p'2-p'0-2 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b); 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str3_16x8b = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b); 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c); 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c); 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_slli_epi16(const2_8x16b, 1); 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b); 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b); 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calculating Clipping MIN for all pixel values. 
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b); 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b); 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0-q'1-2 ,p'0-p'1-2 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e); 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b); 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1-2 p'1-2 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b); 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get 2 in 16 bit 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(const2_8x16b, 8); 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get q33 q23 q13 q03, p33 p23 p13 p03 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8); 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8); 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8); 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1, p'1 (adding 2) 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b); 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0-q'1,p'0-p'1 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b); 
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'2-q'1,p'2-p'1 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b); 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1; 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b); 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1; 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b); 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //normalisation of all modified pixels 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_srai_epi16(temp_pq_str0_16x8b, 3); 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2); 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3); 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting p0 p1 together and p2 p3 together 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b); 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b); 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting q1 q0 together and q3 q2 together 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b); 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b); 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting p's of row0 row1 together and of row2 row3 together 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b); 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str2_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b); 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting q's of row0 row1 together and of row2 row3 together 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b); 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b); 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting values for respective rows in 16 bit 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b); 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b); 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b); 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b); 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //packing values to 8 bit 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b); 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b); 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping MAX 
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b); 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b); 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping MIN 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b); 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b); 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //separating row 2 and row 3 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8); 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8); 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b; 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b; 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i coefdelta_0_8x16b, mask_pq_8x16b; 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const2_8x16b, consttc_8x16b; 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 mask1; 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask1 = 
(((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15); 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar consttc_8x16b = _mm_set1_epi32(tc); 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b); 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b); 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16); 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16); 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08); 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08); 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranged q31 q30 p30 p31 q21 q20 p20 p21 q1 q10 p10 p11 q01 q00 p00 p01 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b); 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1); 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // (-3q1+9q0),(-9p0+3p1) 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b); 
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting to 16 bit 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b); 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting -tc store 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b); 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calc 10 *tc = 2*tc +8*tc ; 2*tc 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1); 4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calc 10 *tc = 2*tc +8*tc ; 8*tc 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3); 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting -tc store 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b); 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calc 10 *tc 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b); 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //const 1 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15); 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b); 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31); 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting the mask values 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1)); 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //loaded coef for delta1 calculation 
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1); 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //(-2q1+q0),(p0-2p1) 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b); 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //const 8 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_slli_epi32(const2_8x16b, 3); 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //rearranging the mask values 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b); 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //normalisation of the filter 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b); 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4); 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting deltaq0 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b); 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //packing d3q d2q d1q d0q d3p d2p d1p d0p 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b); 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //absolute delta 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b); 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping of delta0 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b); 
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mask for |delta| < 10*tc 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b); 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping of delta0 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b); 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //delta 1 calc starts 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting q32 q22 q12 q02 p32 p12 p22 p02 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0)); 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b); 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b); 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b); 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //constant 1 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15); 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //tc>>1 16 bit 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1); 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting -tc>>1 store 16 bit 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b); 
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //2*delta0 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b); 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting all respective q's and p's together 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1)); 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b); 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //final adds for deltap1 and deltaq1 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b); 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b); 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b); 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_setzero_si128(); 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2); 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // clipping delta1 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b); 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // clipping delta1 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b); 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting the mask ready 
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15); 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //masking of the delta values |delta|<10*tc 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b); 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b); 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //packing dq1 dq0 dp0 dp1 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b); 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b); 4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b); 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b); 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //masking of the delta values dep, deq , filter_p ,filter_q 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b); 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b); 5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting 8bit to 16 bit 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b); 5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b); 5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = 
_mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b); 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b); 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //shuffle values loaded 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2); 5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3); 5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranging each row delta in different registers 5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b); 5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b); 5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b); 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b); 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //adding the respective delta 5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b); 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b); 5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b); 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b); 5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //saturating to 8 bit 5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row2_8x16b = 
_mm_packus_epi16(src_row2_8x16b, src_row3_8x16b); 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b); 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //separating different rows 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8); 5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8); 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b); 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b); 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b); 5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b); 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src, 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 bs, 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_p, 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_q, 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 beta_offset_div2, 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_offset_div2, 5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar WORD32 filter_flag_p, 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 filter_flag_q) 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_luma, beta_indx, tc_indx; 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 beta, tc; 5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 d0, d3, dp, dq, d; 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 de_0, de_1, de_2, de_3; 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 d_sam0, d_sam3; 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 de, dep, deq; 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b; 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b; 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b; 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b; 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b; 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT((bs > 0)); 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
ASSERT(filter_flag_p || filter_flag_q); 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_luma = (quant_param_p + quant_param_q + 1) >> 1; 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51); 5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* BS based on implementation can take value 3 if it is intra/inter egde */ 5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */ 5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */ 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* the above desired functionallity is achieved by doing (2*(bs>>1)) */ 5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53); 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar beta = gai4_ihevc_beta_table[beta_indx]; 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc = gai4_ihevc_tc_table[tc_indx]; 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == tc) 5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar return; 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src)); 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd)); 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd)); 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); 5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd)); 5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd)); 5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b); 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b); 5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b); 5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b); 5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b); 6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b); 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c); 
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c); 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coef_8x16b = _mm_load_si128((__m128i *)(coef_d)); 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d)); 6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b); 6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c}; 6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b); 6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b); 6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get all 1's of 8 bit in (1) 6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b); 6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15); 6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //accumulating values foe dp3 dq3 , dp0 dq0 values 6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b); 6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, 
temp_coef1_8x16b); 6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // to get all 1,-1 sets of 16 bits in (0) 6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b); 6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00 6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b); 6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get 16 bit 1's 6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8); 6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // dq3 dp3 dq0 dp0 6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b); 6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec); 6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49); 6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // dq dp d3 d0 6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b); 6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00| 6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b); 6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00| 6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_de_result_8x16b = 
_mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b); 6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ///store back in a single variable 6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4); 6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8); 6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12); 6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d0 = _mm_cvtsi128_si32(mask_d_result_4x32b); 6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d3 = _mm_cvtsi128_si32(temp_coef0_8x16b); 6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dp = _mm_cvtsi128_si32(temp_coef1_8x16b); 6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dq = _mm_cvtsi128_si32(mask_16x8b); 6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting d 6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d = d0 + d3; 6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ///store back in a single variable 6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4); 6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8); 6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12); 6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b); 6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b); 
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b); 6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de_3 = _mm_cvtsi128_si32(mask_16x8b); 6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de = 0; 6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dep = 0; 6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar deq = 0; 6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(d < beta) 6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam0 = 0; 6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if((2 * d0 < (beta >> 2)) 6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && (de_2 < (beta >> 3)) 6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && (de_0 < ((5 * tc + 1) >> 1))) 6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam0 = 1; 6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam3 = 0; 6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if((2 * d3 < (beta >> 2)) 6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && (de_3 < (beta >> 3)) 6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar && de_1 < ((5 * tc + 1) >> 1)) 6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar d_sam3 = 1; 6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar de = (d_sam0 & d_sam3) + 1; 6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dep = (dp < 
(beta + (beta >> 1)) >> 3) ? 1 : 0; 6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0; 6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(tc <= 1) 6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dep = 0; 6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar deq = 0; 6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(de != 0) 6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(2 == de) 7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq0_str0_16x8b; 7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b; 7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_pq2_str0_16x8b; 7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i temp_str0_16x8b, temp_str1_16x8b; 7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const2_8x16b, const2tc_8x16b; 7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 mask, tc2; 7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc = tc << 1; 7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31); 
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc2 = ((LWORD64)tc); 7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b); 7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0-q'1-2 ,p'0-p'1-2 7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b); 7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b); 7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(const2_8x16b, 15); 7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01 7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b); 7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b); 7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b); 7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1-2, p'1-2 7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b); 7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b); 7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b); 7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00 
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b); 7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01 7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b); 7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b); 7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b); 7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping mask design 7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str1_16x8b = _mm_setzero_si128(); 7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask)); 7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2)); 7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44); 7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b); 7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping mask design 7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31); 7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b); 7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calculating Clipping MAX for all pixel values. 
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b); 7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b); 7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //for clipping calc 7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b); 7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //saving the unmodified data of q1 p1 q0 p0 7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b); 7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //CLIpping MAX and MIN for q1 p1 q0 p0 7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b); 7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b); 7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'2-q'0-2,p'2-p'0-2 7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b); 7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b); 7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_slli_epi16(const2_8x16b, 1); 7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03 7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b); 7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = 
_mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b); 7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b); 7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calculating Clipping MAX and MIN for p2 and q2 . 7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b); 7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b); 7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0-q'1-2 ,p'0-p'1-2 7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e); 7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b); 7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1-2 p'1-2 7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b); 7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //to get 2 in 16 bit 7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(const2_8x16b, 8); 7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1, p'1 (adding 2) 7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b); 7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0-q'1,p'0-p'1 7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b); 7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
//q'2-q'1,p'2-p'1 7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b); 7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1; 7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b); 7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1; 7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b); 7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //normalisation of all modified pixels 7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3); 7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2); 7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3); 7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //q'1 p'1 q'0 p'0 7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b); 7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b); 7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //pack with the unmodified data of q2 and p2 7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b); 8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2 p'2 8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = 
_mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b); 8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b); 8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b); 8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b); 8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data 8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b); 8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b); 8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8); 8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8); 8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8); 8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8); 8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b); 8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b); 8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b); 8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b); 8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b); 
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b); 8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else 8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i tmp_delta0_8x16b, tmp_delta1_8x16b; 8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b; 8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i coefdelta_0_8x16b; 8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const2_8x16b, consttc_8x16b; 8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 maskp0, maskp1, maskq0, maskq1; 8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar maskp0 = (LWORD64)filter_flag_p; 8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar maskq0 = (LWORD64)filter_flag_q; 8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar maskp1 = (LWORD64)dep; 8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar maskq1 = (LWORD64)deq; 8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar consttc_8x16b = _mm_set1_epi32(tc); 8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b); 8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b); 8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
//arranged q31 q30 p30 p31 q21 q20 p20 p21 q1 q10 p10 p11 q01 q00 p00 p01 8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b); 8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1); 8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // (-3q1+9q0),(-9p0+3p1) 8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b); 8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting -tc store 8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b); 8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting tc in 16 bit 8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b); 8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calc 10 *tc = 2*tc +8*tc ; 2*tc 8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1); 8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calc 10 *tc = 2*tc +8*tc ; 8*tc 8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3); 8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //const 1 8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15); 8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //calc 10 *tc 8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = 
_mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b); 8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //delta0 without normalisation and clipping 8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b); 8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31); 8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //loaded coef for delta1 calculation 8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1); 8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //(-2q1+q0),(p0-2p1) 8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b); 8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //const 8 8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_slli_epi32(const2_8x16b, 3); 8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //normalisation of the filter 8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b); 8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4); 8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting deltaq0 8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b); 8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting -tc 8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp1_const_8x16b = 
_mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b); 8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //packing d03q d02q d01q d0q d03p d02p d01p d00p 8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b); 8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //absolute delta 8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b); 8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping of delta0 8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b); 8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //tc>>1 16 bit 8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1); 8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //Clipping of delta0 8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b); 8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //(-tc)>>1 16 bit 8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b); 8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mask for |delta| < 10*tc 8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b); 8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //delta 1 calc starts 8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting q32 q22 q12 q02 p32 p12 p22 p02 9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp0_const_8x16b = 
_mm_setzero_si128(); 9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b); 9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b); 9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b); 9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //constant 1 9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15); 9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //2*delta0 9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b); 9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting all respective q's and p's together 9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1)); 9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b); 9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //final adds for deltap1 and deltaq1 9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b); 9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b); 9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b); 9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2); 9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31); 
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0))); 9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0))); 9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p); 9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31); 9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1))); 9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1))); 9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b); 9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b); 9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep); 9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b); 9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //rearranging the mask values 9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50); 9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50); 9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31); 
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31); 9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31); 9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31); 9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //combining mask delta1 9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b); 9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // clipping delta1 9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b); 9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //combining mask delat0 9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b); 9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // clipping delta1 9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b); 9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //masking of the delta values |delta|<10*tc 9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b); 9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b); 9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //separating p and q delta 0 and addinq p0 and q0 9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b); 
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b); 9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b); 9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b); 9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b); 9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b); 9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //separating p and q delta 0 and addinq p0 and q0 9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b); 9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b); 9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b); 9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b); 9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b); 9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b); 9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //packing p1 q1 and p0 q0 to 8 bit 9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b); 9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b); 9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8); 9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8); 9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b); 9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b); 9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b); 9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b); 9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src, 9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_p, 9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_q, 9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_offset_u, 9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_offset_v, 9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_offset_div2, 
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 filter_flag_p, 9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 filter_flag_q) 9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_indx_u, qp_chroma_u; 9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_indx_v, qp_chroma_v; 10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_indx_u, tc_u; 10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_indx_v, tc_v; 10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b; 10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(filter_flag_p || filter_flag_q); 10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* chroma processing is done only if BS is 2 */ 10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* this function is assumed to be called only if BS is 2 */ 10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1); 10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]); 10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1); 10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? 
qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]); 10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53); 10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_u = gai4_ihevc_tc_table[tc_indx_u]; 10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53); 10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_v = gai4_ihevc_tc_table[tc_indx_v]; 10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == tc_u && 0 == tc_v) 10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar return; 10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4)); 10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4)); 10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4)); 10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4)); 10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 mask_tc, mask_flag, mask; 10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i delta_vu0_16x8b, delta_vu1_16x8b; 10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b; 
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i min_0_16x8b; 10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_16x8b; 10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63); 10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u); 10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask = 0xffff00000000ffffLL; 10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b); 10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b); 10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv)); 10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01 10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21 10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b); 10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b); 10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b); 10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b); 10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // pv30 pv31 pu30 pu31 
pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01 10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00 10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0); 10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1); 10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b); 10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b); 10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //generating offset 4 10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b); 10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // filter flag mask and tc mask 10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc)); 10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag)); 10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00); 10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31); 10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //-tc 10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b); 10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting const 1 
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_srli_epi16(const_16x8b, 15); 10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //filterp and filterq flag 10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00); 10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55); 10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //modified delta with a filter (1 -4 4 -1) available in 16 bit 10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b); 10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting const 4 10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_slli_epi16(const_16x8b, 2); 10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask)); 10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //offset addition 10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b); 10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //eliminating q1 10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8); 10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_setzero_si128(); 10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //filter after normalisation 10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3); 
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44); 10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping MAX 10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8); 10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting p0 and eliminating p1 10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8); 10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping MIN 10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b); 10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //getting q0 10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8); 10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //masking filter flag 11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b); 11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b); 11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // q-delta ,p+delta 11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b); 11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b); 11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //merging q0 and p0 of respective rows 11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b); 
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b); 11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // row 0 and row 1 packed , row2 and row3 packed 11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b); 11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b); 11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //removing older pixel values 11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b); 11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b); 11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //arranging modified pixels 11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8); 11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8); 11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16); 11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16); 11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //plugging the modified values 11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b); 11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b); 11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //geting values for row1 and row 3 
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8); 11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8); 11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b); 11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b); 11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b); 11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b); 11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src, 11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_p, 11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 quant_param_q, 11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_offset_u, 11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_offset_v, 11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_offset_div2, 11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 filter_flag_p, 11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 
filter_flag_q) 11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_indx_u, qp_chroma_u; 11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 qp_indx_v, qp_chroma_v; 11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_indx_u, tc_u; 11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 tc_indx_v, tc_v; 11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b; 11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(filter_flag_p || filter_flag_q); 11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* chroma processing is done only if BS is 2 */ 11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* this function is assumed to be called only if BS is 2 */ 11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1); 11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]); 11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1); 11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? 
qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]); 11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53); 11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_u = gai4_ihevc_tc_table[tc_indx_u]; 11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53); 11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tc_v = gai4_ihevc_tc_table[tc_indx_v]; 11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == tc_u && 0 == tc_v) 11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar return; 11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd)); 11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd)); 11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LWORD64 mask_tc, mask_flag; 11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i delta_vu0_16x8b, delta_vu1_16x8b; 11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b; 
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i min_0_16x8b; 11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_16x8b; 11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63); 11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u); 11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b); 11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b); 11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01 11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00 11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0); 11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1); 11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b); 12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b); 12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // filter flag mask and tc mask 12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc)); 
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag)); 12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //generating offset 4 12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b); 12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // filter flag mask and tc mask 12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00); 12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31); 12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //-tc 12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b); 12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting const 1 12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_srli_epi16(const_16x8b, 15); 12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //filterp 12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00); 12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting const 4 12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_16x8b = _mm_slli_epi16(const_16x8b, 2); 12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //modified delta with a filter (1 -4 4 -1) available in 16 bit 12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b); 
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //filterq flag 12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55); 12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //offset addition 12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b); 12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mask_16x8b = _mm_setzero_si128(); 12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //filter after normalisation 12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3); 12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting p0 to 16bit 12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b); 12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping MAX 12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8); 12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //converting q0 to 16bit 12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b); 12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //clipping MIN 12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b); 12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //masking filter flag 12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b); 
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b); 12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // q-delta ,p+delta 12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b); 12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b); 12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // p0 and q0 packed 12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b); 12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b); 12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b); 12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b); 12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 1264