10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_deblck_atom_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Contains function definitions for deblocking filters
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Rishab
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_deblk_luma_vert_ssse3()
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_deblk_luma_horz_ssse3()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_deblk_chroma_vert_ssse3()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_deblk_chroma_horz_ssse3()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdlib.h>
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h>
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <assert.h>
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_deblk.h"
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_deblk_tables.h"
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_debug.h"
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_tables_x86_intr.h"
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*       Decision process and filtering for the luma block vertical edge.
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*     The decision process for the luma block vertical edge is  carried out and
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*     an appropriate filter is applied. The  boundary filter strength, bs should
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*     be greater than 0.  The pcm flags and the transquant bypass flags should
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*     be  taken care of by the calling function.
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to the src sample q(0,0)
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] bs
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Boundary filter strength of q(0,0)
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] quant_param_p
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  quantization parameter of p block
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] quant_param_q
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  quantization parameter of q block
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] beta_offset_div2
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  beta offset; doubled and added to the averaged QP, then clipped to 0..51 to index the beta table
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] tc_offset_div2
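840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  tc offset; doubled and added to the averaged QP plus 2 * (bs >> 1), then clipped to 0..53 to index the tc table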
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] filter_flag_p
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  flag whether to filter the p block
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] filter_flag_q
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  flag whether to filter the q block
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 src_strd,
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 bs,
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 quant_param_p,
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 quant_param_q,
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 beta_offset_div2,
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 tc_offset_div2,
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 filter_flag_p,
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 filter_flag_q)
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 qp_luma, beta_indx, tc_indx;
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 beta, tc;
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 d, dp, dq, d_sam0, d_sam3;
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 d3, d0, de_0, de_1, de_2, de_3;
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 de, dep, deq;
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ASSERT((bs > 0) && (bs <= 3));
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ASSERT(filter_flag_p || filter_flag_q);
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* BS based on implementation can take value 3 if it is intra/inter edge          */
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* based on BS, tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* the above desired functionality is achieved by doing (2*(bs>>1))               */
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        beta = gai4_ihevc_beta_table[beta_indx];
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tc = gai4_ihevc_tc_table[tc_indx];
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == tc)
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            return;
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //to get all 1's of 8 bit in (1)
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //accumulating values for dp3 dq3 , dp0 dq0 values
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // to get all 1,-1 sets of 16 bits in (0)
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //to get 16 bit 1's
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // dq3 dp3 dq0 dp0
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // dq dp d3 d0
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ///store back in a single variable
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dq = _mm_cvtsi128_si32(mask_16x8b);
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //getting d
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        d = d0 + d3;
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ///store back in a single variable
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_3 = _mm_cvtsi128_si32(mask_16x8b);
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de = 0;
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dep = 0;
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        deq = 0;
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(d < beta)
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            d_sam0 = 0;
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if((2 * d0 < (beta >> 2))
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (de_2 < (beta >> 3))
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (de_0 < ((5 * tc + 1) >> 1)))
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                d_sam0 = 1;
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            d_sam3 = 0;
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if((2 * d3 < (beta >> 2))
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (de_3 < (beta >> 3))
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && de_1 < ((5 * tc + 1) >> 1))
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                d_sam3 = 1;
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            de = (d_sam0 & d_sam3) + 1;
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if(tc <= 1)
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dep = 0;
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                deq = 0;
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(de != 0)
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(de == 2)
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq_str0_16x8b;
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq2_str0_16x8b;
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq_str1_16x8b;
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i const2_8x16b, const2tc_8x16b;
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            LWORD64 mask, tc2;
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tc = tc << 1;
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tc2 = ((LWORD64)tc);
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0-q'1-2 ,p'0-p'1-2
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged x x x x x x x x q31 q30 q1 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1-2, p'1-2
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //clipping mask design
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str1_16x8b = _mm_setzero_si128();
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //clipping mask design
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calculating Clipping MAX for all pixel values.
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'2-q'0-2,p'2-p'0-2
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str3_16x8b     = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str3_16x8b     = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calculating Clipping MIN for all pixel values.
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0-q'1-2 ,p'0-p'1-2
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1-2 p'1-2
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //to get 2 in 16 bit
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //to get q33 q23 q13 q03, p33 p23 p13 p03
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1, p'1 (adding 2)
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0-q'1,p'0-p'1
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'2-q'1,p'2-p'1
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //normalisation of all modified pixels
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b  = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting p0 p1 together and p2 p3 together
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting q1 q0 together and  q3 q2 together
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting p's of row0 row1 together and of row2 row3 together
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str2_16x8b    = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting q's of row0 row1 together and of row2 row3 together
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b    = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting values for respective rows in 16 bit
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //packing values to 8 bit
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping MAX
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping MIN
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //separating row 2 and row 3
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i const2_8x16b, consttc_8x16b;
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            LWORD64 mask1;
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            consttc_8x16b = _mm_set1_epi32(tc);
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q11 q10 p10 p11 q01 q00 p00 p01
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // (-3q1+9q0),(-9p0+3p1)
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //converting to 16 bit
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting -tc store
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calc 10 *tc = 2*tc +8*tc ; 2*tc
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calc 10 *tc = 2*tc +8*tc ; 8*tc
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting -tc store
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calc 10 *tc
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //const 1
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting the mask values
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //loaded coef for delta1 calculation
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //(-2q1+q0),(p0-2p1)
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //const 8
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //rearranging the mask values
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //normalisation of the filter
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting deltaq0
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //packing  d3q d2q d1q d0q d3p d2p d1p d0p
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //absolute delta
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping of delta0
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //mask for |delta| < 10*tc
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping of delta0
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //delta 1 calc starts
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting q32 q22 q12 q02 p32 p12 p22 p02
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b =  _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //constant 1
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //tc>>1 16 bit
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting -tc>>1 store  16 bit
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //2*delta0
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting  all respective q's and p's together
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //final adds for deltap1 and deltaq1
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_setzero_si128();
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // clipping delta1
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // clipping delta1
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting the mask ready
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //masking of the delta values |delta|<10*tc
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //packing dq1 dq0 dp0 dp1
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //masking of the delta values dep, deq , filter_p ,filter_q
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //converting 8bit to 16 bit
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //shuffle values loaded
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranging each row delta in different registers
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //adding the respective delta
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //saturating to 8 bit
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //separating different rows
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 src_strd,
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 bs,
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 quant_param_p,
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 quant_param_q,
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 beta_offset_div2,
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 tc_offset_div2,
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 filter_flag_p,
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                 WORD32 filter_flag_q)
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 qp_luma, beta_indx, tc_indx;
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 beta, tc;
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 d0, d3, dp, dq, d;
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 de_0, de_1, de_2, de_3;
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 d_sam0, d_sam3;
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 de, dep, deq;
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ASSERT((bs > 0));
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ASSERT(filter_flag_p || filter_flag_q);
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* BS based on implementation can take value 3 if it is intra/inter edge           */
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* based on BS, tc index is calculated by adding 2 * ( bs - 1) to QP and tc_offset */
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)           */
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* the above desired functionality is achieved by doing (2*(bs>>1))                */
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        beta = gai4_ihevc_beta_table[beta_indx];
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tc = gai4_ihevc_tc_table[tc_indx];
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == tc)
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            return;
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //to get all 1's of 8 bit in (1)
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //accumulating values for dp3 dq3 , dp0 dq0 values
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // to get all 1,-1 sets of 16 bits in (0)
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //to get 16 bit 1's
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // dq3 dp3 dq0 dp0
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // dq dp d3 d0
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ///store back in a single variable
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dq = _mm_cvtsi128_si32(mask_16x8b);
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //getting d
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        d = d0 + d3;
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ///store back in a single variable
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de_3 = _mm_cvtsi128_si32(mask_16x8b);
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        de = 0;
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dep = 0;
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        deq = 0;
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(d < beta)
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            d_sam0 = 0;
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if((2 * d0 < (beta >> 2))
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (de_2 < (beta >> 3))
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (de_0 < ((5 * tc + 1) >> 1)))
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                d_sam0 = 1;
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            d_sam3 = 0;
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if((2 * d3 < (beta >> 2))
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (de_3 < (beta >> 3))
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && de_1 < ((5 * tc + 1) >> 1))
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                d_sam3 = 1;
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            de = (d_sam0 & d_sam3) + 1;
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if(tc <= 1)
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dep = 0;
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                deq = 0;
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(de != 0)
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(2 == de)
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq0_str0_16x8b;
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_pq2_str0_16x8b;
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp_str0_16x8b, temp_str1_16x8b;
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i const2_8x16b, const2tc_8x16b;
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            LWORD64 mask, tc2;
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tc = tc << 1;
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tc2 = ((LWORD64)tc);
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0-q'1-2 ,p'0-p'1-2
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b   = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1-2, p'1-2
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //clipping mask design
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str1_16x8b = _mm_setzero_si128();
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //clipping mask design
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calculating Clipping MAX for all pixel values.
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //for clipping calc
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //saving the unmodified data of q1 p1 q0 p0
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //CLIpping MAX and MIN for q1 p1 q0 p0
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'2-q'0-2,p'2-p'0-2
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calculating Clipping MAX and MIN for p2 and q2 .
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0-q'1-2 ,p'0-p'1-2
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1-2 p'1-2
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //to get 2 in 16 bit
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1, p'1 (adding 2)
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0-q'1,p'0-p'1
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'2-q'1,p'2-p'1
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //normalisation of all modified pixels
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //q'1 p'1 q'0 p'0
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //pack with the unmodified data of q2 and p2
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2  p'2
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i coefdelta_0_8x16b;
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i const2_8x16b, consttc_8x16b;
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            LWORD64 maskp0, maskp1, maskq0, maskq1;
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            maskp0 = (LWORD64)filter_flag_p;
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            maskq0 = (LWORD64)filter_flag_q;
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            maskp1 = (LWORD64)dep;
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            maskq1 = (LWORD64)deq;
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            consttc_8x16b = _mm_set1_epi32(tc);
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // (-3q1+9q0),(-9p0+3p1)
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting -tc store
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting tc in 16 bit
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calc 10 *tc = 2*tc +8*tc ; 2*tc
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calc 10 *tc = 2*tc +8*tc ; 8*tc
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //const 1
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //calc 10 *tc
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //delta0 without normalisation and clipping
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //loaded coef for delta1 calculation
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //(-2q1+q0),(p0-2p1)
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //const 8
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //normalisation of the filter
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting deltaq0
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting -tc
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //packing  d03q d02q d01q d0q d03p d02p d01p d00p
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //absolute delta
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping of delta0
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //tc>>1 16 bit
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //Clipping of delta0
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //(-tc)>>1 16 bit
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //mask for |delta| < 10*tc
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //delta 1 calc starts
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting q32 q22 q12 q02 p32 p12 p22 p02
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp0_const_8x16b = _mm_setzero_si128();
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //constant 1
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //2*delta0
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //getting  all respective q's and p's together
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //final adds for deltap1 and deltaq1
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //   src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //rearranging the mask values
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //combining mask delta1
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // clipping delta1
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //combining mask delat0
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // clipping delta1
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //masking of the delta values |delta|<10*tc
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //separating p and q delta 0 and addinq p0 and q0
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //separating p and q delta 0 and addinq p0 and q0
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //packing p1 q1 and p0 q0 to 8 bit
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 src_strd,
9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 quant_param_p,
9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 quant_param_q,
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 qp_offset_u,
9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 qp_offset_v,
9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 tc_offset_div2,
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 filter_flag_p,
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 filter_flag_q)
9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 qp_indx_u, qp_chroma_u;
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 qp_indx_v, qp_chroma_v;
10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tc_indx_u, tc_u;
10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tc_indx_v, tc_v;
10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(filter_flag_p || filter_flag_q);
10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* chroma processing is done only if BS is 2             */
10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* this function is assumed to be called only if BS is 2 */
10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_u = gai4_ihevc_tc_table[tc_indx_u];
10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_v = gai4_ihevc_tc_table[tc_indx_v];
10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == tc_u && 0 == tc_v)
10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        return;
10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));
10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        LWORD64 mask_tc, mask_flag, mask;
10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i min_0_16x8b;
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_16x8b;
10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask = 0xffff00000000ffffLL;
10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);
10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);
10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //generating offset 4
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // filter flag mask and tc mask
10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));
10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //-tc
10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //converting const 1
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_srli_epi16(const_16x8b, 15);
10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //filterp and filterq flag
10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //modified delta with a filter (1 -4 4 -1) available in 16 bit
10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //converting const 4
10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //offset addition
10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //eliminating q1
10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);
10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_setzero_si128();
10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //filter after normalisation
10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);
10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //clipping MAX
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //getting p0 and eliminating p1
10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //clipping MIN
10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //getting q0
10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //masking filter flag
11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);
11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // q-delta ,p+delta
11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //merging q0 and p0 of respective rows
11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // row 0 and row 1 packed , row2 and row3 packed
11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //removing older pixel values
11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //arranging modified pixels
11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //plugging the modified values
11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);
11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //geting values for row1 and row 3
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);
11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 src_strd,
11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 quant_param_p,
11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 quant_param_q,
11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 qp_offset_u,
11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 qp_offset_v,
11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 tc_offset_div2,
11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 filter_flag_p,
11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 filter_flag_q)
11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 qp_indx_u, qp_chroma_u;
11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 qp_indx_v, qp_chroma_v;
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tc_indx_u, tc_u;
11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 tc_indx_v, tc_v;
11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;
11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(filter_flag_p || filter_flag_q);
11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* chroma processing is done only if BS is 2             */
11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* this function is assumed to be called only if BS is 2 */
11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_u = gai4_ihevc_tc_table[tc_indx_u];
11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tc_v = gai4_ihevc_tc_table[tc_indx_v];
11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == tc_u && 0 == tc_v)
11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        return;
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        LWORD64 mask_tc, mask_flag;
11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i min_0_16x8b;
11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_16x8b;
11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);
11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);
11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);
12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // filter flag mask and tc mask
12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));
12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //generating offset 4
12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // filter flag mask and tc mask
12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //-tc
12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //converting const 1
12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_srli_epi16(const_16x8b, 15);
12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //filterp
12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //converting const 4
12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //modified delta with a filter (1 -4 4 -1) available in 16 bit
12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //filterq flag
12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //offset addition
12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        mask_16x8b = _mm_setzero_si128();
12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //filter after normalisation
12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //converting p0 to 16bit
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //clipping MAX
12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //converting q0 to 16bit
12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //clipping MIN
12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //masking filter flag
12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);
12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // q-delta ,p+delta
12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);
12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // p0 and q0 packed
12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);
12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);
12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
1264