18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/****************************************************************************** 28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Copyright (C) 2015 The Android Open Source Project 48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Licensed under the Apache License, Version 2.0 (the "License"); 68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * you may not use this file except in compliance with the License. 78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * You may obtain a copy of the License at: 88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * http://www.apache.org/licenses/LICENSE-2.0 108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Unless required by applicable law or agreed to in writing, software 128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * distributed under the License is distributed on an "AS IS" BASIS, 138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * See the License for the specific language governing permissions and 158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * limitations under the License. 168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ***************************************************************************** 188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore 198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* File Name : ih264_deblk_luma_ssse3.c */ 238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : Contains function definitions for deblocking */ 258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* List of Functions : ih264_deblk_luma_vert_bs4_ssse3() */ 278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ih264_deblk_luma_horz_bs4_ssse3() */ 288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ih264_deblk_luma_vert_bslt4_ssse3() */ 298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ih264_deblk_luma_horz_bslt4_ssse3() */ 308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ih264_deblk_luma_vert_bs4_mbaff_ssse3() */ 318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */ 328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Issues / Problems : None */ 348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Revision History : */ 368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */ 398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* intrinsics */ 408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S/*****************************************************************************/ 428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* File Includes */ 458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* System include files */ 488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stdio.h> 498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* User include files */ 518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_typedefs.h" 528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_platform_macros.h" 538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_deblk_edge_filters.h" 548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_macros.h" 558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Definitions */ 588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Name : ih264_deblk_luma_vert_bs4_ssse3() */ 638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : This function performs filtering of a luma block */ 658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* vertical edge when the boundary strength is set to 4. */ 668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Inputs : pu1_src - pointer to the src sample q0 */ 688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* src_strd - source stride */ 698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* alpha - alpha value for the boundary */ 708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* beta - beta value for the boundary */ 718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Globals : None */ 738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Processing : This operation is described in Sec. 8.7.2.4 under the */ 758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* title "Filtering process for edges for bS equal to 4" in */ 768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ITU T Rec H.264. 
*/ 778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Outputs : None */ 798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Returns : None */ 818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Issues : None */ 838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Revision History: */ 858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 12 02 2015 Naveen Kumar P Initial version */ 888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src, 918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 alpha, 938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 beta) 948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero = _mm_setzero_si128(); 968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8; 978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8; 988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16; 998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16; 1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8_1; 
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_1; 1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2; 1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2; 1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp1, temp2, temp3, temp4, temp5, temp6; 1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i Alpha_8x16, Beta_8x16; 1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8; 1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i const_val2_16x8 = _mm_set1_epi16(2); 1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i line1, line2, line3, line4, line5, line6, line7, line8; 1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_set1_epi16(alpha); 1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Beta_8x16 = _mm_set1_epi16(beta); 1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd)); 1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd)); 1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd)); 1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd)); 1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd)); 1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd)); 1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd)); 1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd)); 1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(line1, line2); 1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(line3, line4); 1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(line5, line6); 1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(line7, line8); 1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi16(temp1, temp2); 1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_unpackhi_epi16(temp1, temp2); 1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpacklo_epi16(temp3, temp4); 1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_unpackhi_epi16(temp3, temp4); 1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpacklo_epi32(line1, line3); 1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpackhi_epi32(line1, line3); 1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpacklo_epi32(line2, line4); 1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpackhi_epi32(line2, line4); 1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd)); 1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd)); 1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd)); 1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd)); 
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd)); 1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd)); 1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd)); 1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd)); 1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(line1, line2); 1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(line3, line4); 1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(line5, line6); 1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(line7, line8); 1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi16(temp1, temp2); 1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_unpackhi_epi16(temp1, temp2); 1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpacklo_epi16(temp3, temp4); 1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_unpackhi_epi16(temp3, temp4); 1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(line1, line3); 1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpackhi_epi32(line1, line3); 1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi32(line2, line4); 1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpackhi_epi32(line2, line4); 1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1); 
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1); 1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4); 1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4); 1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2); 1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2); 1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3); 1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3); 1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond1 (ABS(p0 - q0) < alpha) 1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); 1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); 1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_packs_epi16(temp2, temp1); 1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond2 (ABS(q1 - q0) < beta) 1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); 1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); 1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond3 (ABS(p1 - p0) < beta) 1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); 2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); 2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) 2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p0 - q0) < ((alpha >> 2) + 2)) 2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, q0_16x8); 2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q0_16x8, p0_16x8); 2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2); 2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8); 2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p2 - p0) < beta) 
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); 2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); 2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_packs_epi16(temp2, temp1); 2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8); 2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(q2 - q0) < beta) 2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); 2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); 2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag4_16x8 = 
_mm_packs_epi16(temp2, temp1); 2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8); 2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // First 8 pixels 2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero); 2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero); 2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero); 2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero); 2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero); 2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero); 2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero); 2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero); 2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_1 and q0_1 2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(p0_8x16, q1_8x16); 2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(p1_8x16, q0_8x16); 2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp1, const_val2_16x8); 2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp2, const_val2_16x8); 2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p1_8x16, 1); 2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q1_8x16, 1); 2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp5, temp3); 2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp6, temp4); 
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_srai_epi16(temp1, 2); 2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_srai_epi16(temp2, 2); 2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1_2 and q1_2 2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp6, p0_8x16); 2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp5, q0_8x16); 2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, p2_8x16); 2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, q2_8x16); 2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_srai_epi16(temp1, 2); 2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_srai_epi16(temp2, 2); 2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_2 and q0_2 2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp3, p2_8x16); 2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp4, q2_8x16); 2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, q1_8x16); 2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, p1_8x16); 2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p0_8x16, q0_8x16); 2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(temp3, 1); 2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp3); 2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4)); 2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4)); 
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_srai_epi16(temp1, 3); 2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_srai_epi16(temp2, 3); 2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2_2 and q2_2 3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, const_val2_16x8); 3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, const_val2_16x8); 3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p2_8x16, 1); 3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q2_8x16, 1); 3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p2_8x16, temp3); 3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(q2_8x16, temp4); 3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_slli_epi16(p3_8x16, 1); 3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_slli_epi16(q3_8x16, 1); 3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp4); 3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp5); 3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp6); 3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_srai_epi16(temp1, 3); 3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_srai_epi16(temp2, 3); 3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Second 8 pixels and packing with first 8 pixels 3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero); 3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpackhi_epi8(p2_16x8, 
zero); 3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero); 3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero); 3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero); 3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero); 3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero); 3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero); 3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_1 and q0_1 3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(p0_8x16, q1_8x16); 3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(p1_8x16, q0_8x16); 3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp1, const_val2_16x8); 3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp2, const_val2_16x8); 3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p1_8x16, 1); 3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q1_8x16, 1); 3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp5, temp3); 3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp6, temp4); 3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 2); 3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 2); 3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1); 3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2); 3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1_2 and q1_2 3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp6, p0_8x16); 3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp5, q0_8x16); 3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, p2_8x16); 3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, q2_8x16); 3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 2); 3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 2); 3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1); 3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2); 3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_2 and q0_2 3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp3, p2_8x16); 3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp4, q2_8x16); 3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, q1_8x16); 3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, p1_8x16); 3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p0_8x16, q0_8x16); 3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(temp3, 1); 3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp3); 3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4)); 3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4)); 3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = 
_mm_srai_epi16(temp1, 3); 3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 3); 3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1); 3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2); 3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2_2 and q2_2 3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, const_val2_16x8); 3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, const_val2_16x8); 3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p2_8x16, 1); 3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q2_8x16, 1); 3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p2_8x16, temp3); 3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(q2_8x16, temp4); 3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_slli_epi16(p3_8x16, 1); 3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_slli_epi16(q3_8x16, 1); 3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp4); 3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp5); 3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp6); 3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 3); 3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 3); 3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1); 3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2); 
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 and q0 3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_and_si128(p0_16x8, 3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8); 3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1); 3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_and_si128(q0_16x8, 3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8); 3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1); 3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 and q0 3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_and_si128(p0_16x8, 3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8); 3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2); 3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_and_si128(q0_16x8, 3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8); 4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2); 4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1 and q1 
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_and_si128(p1_16x8, 4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8); 4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2); 4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_and_si128(q1_16x8, 4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8); 4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2); 4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2 and q2 4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_and_si128(p2_16x8, 4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8); 4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2); 4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_and_si128(q2_16x8, 4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8); 4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2); 4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8); 4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8); 
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8); 4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8); 4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpacklo_epi16(temp1, temp2); 4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpackhi_epi16(temp1, temp2); 4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpacklo_epi16(temp3, temp4); 4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpackhi_epi16(temp3, temp4); 4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16); 4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_srli_si128(line1, 8); 4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16); 4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_srli_si128(line3, 8); 4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16); 4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_srli_si128(line5, 8); 4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16); 4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_srli_si128(line7, 8); 4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1); 4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2); 4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3); 4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 
4 + 3 * src_strd), line4); 4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5); 4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6); 4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7); 4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8); 4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8); 4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8); 4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8); 4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8); 4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpacklo_epi16(temp1, temp2); 4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpackhi_epi16(temp1, temp2); 4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpacklo_epi16(temp3, temp4); 4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpackhi_epi16(temp3, temp4); 4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16); 4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_srli_si128(line1, 8); 4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16); 4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_srli_si128(line3, 8); 4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16); 
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_srli_si128(line5, 8); 4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16); 4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_srli_si128(line7, 8); 4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1); 4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2); 4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3); 4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4); 4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5); 4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6); 4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7); 4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8); 4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Name : ih264_deblk_luma_horz_bs4_ssse3() */ 4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : This function performs filtering of a luma block */ 
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* horizontal edge when the boundary strength is set to 4. */ 4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Inputs : pu1_src - pointer to the src sample q0 */ 4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* src_strd - source stride */ 4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* alpha - alpha value for the boundary */ 4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* beta - beta value for the boundary */ 4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Globals : None */ 4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Processing : This operation is described in Sec. 8.7.2.4 under the */ 4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* title "Filtering process for edges for bS equal to 4" in */ 4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* ITU-T Rec. H.264. 
*/ 4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Outputs : None */ 5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Returns : None */ 5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Issues : None */ 5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Revision History: */ 5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 12 02 2015 Naveen Kumar P Initial version */ 5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src, 5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 alpha, 5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 beta) 5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0; 5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD16 i16_posQ1, i16_posQ2, i16_posQ3; 5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_HorzPixel; 5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero = _mm_setzero_si128(); 5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8; 5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8; 
5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16; 5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16; 5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8_1; 5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_1; 5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2; 5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2; 5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp1, temp2, temp3, temp4, temp5, temp6; 5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i Alpha_8x16, Beta_8x16; 5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8; 5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i const_val2_16x8 = _mm_set1_epi16(2); 5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_HorzPixel = pu1_src - (src_strd << 2); 5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posQ1 = src_strd; 5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posQ2 = X2(src_strd); 5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posQ3 = X3(src_strd); 5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posP0 = X3(src_strd); 5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posP1 = X2(src_strd); 5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posP2 = src_strd; 5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i16_posP3 = 0; 5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_set1_epi16(alpha); 5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Beta_8x16 = _mm_set1_epi16(beta); 
5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3)); 5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2)); 5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1)); 5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0)); 5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src)); 5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1)); 5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2)); 5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3)); 5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond1 (ABS(p0 - q0) < alpha) 5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); 5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); 5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_packs_epi16(temp2, temp1); 5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond2 (ABS(q1 - q0) < beta) 5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); 5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); 5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond3 (ABS(p1 - p0) < beta) 5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); 5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); 5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = 
_mm_unpackhi_epi8(temp1, zero); 5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) 5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p0 - q0) < ((alpha >> 2) + 2)) 6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, q0_16x8); 6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q0_16x8, p0_16x8); 6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2); 6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8); 6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 
6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p2 - p0) < beta) 6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); 6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); 6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_packs_epi16(temp2, temp1); 6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8); 6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(q2 - q0) < beta) 6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); 6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); 6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = 
_mm_cmpgt_epi16(Beta_8x16, temp2); 6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag4_16x8 = _mm_packs_epi16(temp2, temp1); 6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8); 6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // First 8 pixels 6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero); 6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero); 6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero); 6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero); 6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero); 6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero); 6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero); 6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero); 6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_1 and q0_1 6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(p0_8x16, q1_8x16); 6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(p1_8x16, q0_8x16); 6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp1, const_val2_16x8); 6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp2, const_val2_16x8); 6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p1_8x16, 1); 
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q1_8x16, 1); 6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp5, temp3); 6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp6, temp4); 6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_srai_epi16(temp1, 2); 6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_srai_epi16(temp2, 2); 6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1_2 and q1_2 6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp6, p0_8x16); 6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp5, q0_8x16); 6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, p2_8x16); 6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, q2_8x16); 6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_srai_epi16(temp1, 2); 6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_srai_epi16(temp2, 2); 6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_2 and q0_2 6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp3, p2_8x16); 6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp4, q2_8x16); 6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, q1_8x16); 6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, p1_8x16); 6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p0_8x16, q0_8x16); 6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(temp3, 1); 6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 
6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp3); 6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4)); 6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4)); 6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_srai_epi16(temp1, 3); 6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_srai_epi16(temp2, 3); 6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2_2 and q2_2 6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, const_val2_16x8); 6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, const_val2_16x8); 6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p2_8x16, 1); 6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q2_8x16, 1); 6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p2_8x16, temp3); 6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(q2_8x16, temp4); 6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_slli_epi16(p3_8x16, 1); 6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_slli_epi16(q3_8x16, 1); 6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp4); 6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp5); 6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp6); 6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_srai_epi16(temp1, 3); 6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_srai_epi16(temp2, 3); 6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Second 8 pixels and packing with first 8 pixels 7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero); 7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero); 7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero); 7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero); 7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero); 7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero); 7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero); 7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero); 7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_1 and q0_1 7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(p0_8x16, q1_8x16); 7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(p1_8x16, q0_8x16); 7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp1, const_val2_16x8); 7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp2, const_val2_16x8); 7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p1_8x16, 1); 7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q1_8x16, 1); 7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp5, temp3); 7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp6, temp4); 7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 2); 7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 2); 
7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1); 7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2); 7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1_2 and q1_2 7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp6, p0_8x16); 7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp5, q0_8x16); 7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, p2_8x16); 7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, q2_8x16); 7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 2); 7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 2); 7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1); 7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2); 7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_2 and q0_2 7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp3, p2_8x16); 7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp4, q2_8x16); 7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, q1_8x16); 7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, p1_8x16); 7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p0_8x16, q0_8x16); 7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(temp3, 1); 7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp3); 
7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4)); 7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4)); 7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 3); 7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 3); 7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1); 7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2); 7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2_2 and q2_2 7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, const_val2_16x8); 7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, const_val2_16x8); 7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p2_8x16, 1); 7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q2_8x16, 1); 7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p2_8x16, temp3); 7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(q2_8x16, temp4); 7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_slli_epi16(p3_8x16, 1); 7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_slli_epi16(q3_8x16, 1); 7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp4); 7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp5); 7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp6); 7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_srai_epi16(temp1, 3); 
7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_srai_epi16(temp2, 3); 7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1); 7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2); 7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 and q0 7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_and_si128(p0_16x8, 7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8); 7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1); 7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_and_si128(q0_16x8, 7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8); 7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1); 7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 and q0 7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_and_si128(p0_16x8, 7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8); 7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2); 7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_and_si128(q0_16x8, 7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8); 7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2); 7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1 and q1 7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_and_si128(p1_16x8, 7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8); 7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2); 7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_and_si128(q1_16x8, 7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8); 7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2); 7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2 and q2 7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_and_si128(p2_16x8, 8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8); 8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2); 8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_and_si128(q2_16x8, 8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8); 8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2); 
8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8); 8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8); 8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8); 8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8); 8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8); 8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8); 8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Name : ih264_deblk_luma_vert_bslt4_ssse3() */ 8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : This function performs filtering of a luma block */ 8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* vertical edge when the boundary strength is less than 4. 
*/ 8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Inputs : pu1_src - pointer to the src sample q0 */ 8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* src_strd - source stride */ 8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* alpha - alpha value for the boundary */ 8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* beta - beta value for the boundary */ 8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* u4_bs - packed boundary strengths, one per byte (MSB first) */ 8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* pu1_cliptab - tc0 clip table, indexed by boundary strength */ 8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Globals : None */ 8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Processing : This operation is described in Sec. 8.7.2.3 under the */ 8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* title "Filtering process for edges with bS less than 4" */ 8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* in ITU-T Rec. H.264. 
*/ 8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Outputs : None */ 8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Returns : None */ 8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Issues : None */ 8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Revision History: */ 8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 12 02 2015 Naveen Kumar P Initial version */ 8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src, 8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 alpha, 8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 beta, 8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 u4_bs, 8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD8 *pu1_cliptab) 8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 u1_Bs, u1_Bs1; 8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8597497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar WORD32 j = 0; 8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; 
8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i int1, int2, int3, int4, high1, high2; 8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i flag, flag1, i_C, i_C0; 8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp, 8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1; 8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero = _mm_setzero_si128(); 8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S for(j = 0; j <= 8 * src_strd; j += 8 * src_strd) 8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Transpose 8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j)); 8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j)); 8738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j)); 8748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j)); 8758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = _mm_unpacklo_epi8(linea, zero); 8778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_unpacklo_epi8(lineb, zero); 8788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = _mm_unpacklo_epi8(linec, zero); 8798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_unpacklo_epi8(lined, zero); 8808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int1 = _mm_unpacklo_epi16(linea, lineb); 8828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_unpackhi_epi16(linea, lineb); 8838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
8848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpacklo_epi16(linec, lined); 8858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_unpackhi_epi16(linec, lined); 8868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = _mm_unpacklo_epi16(int1, int2); 8888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int1 = _mm_unpackhi_epi16(int1, int2); 8898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = _mm_unpacklo_epi16(lineb, lined); 8918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S high1 = _mm_unpackhi_epi16(lineb, lined); 8928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j)); 8948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j)); 8958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j)); 8968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j)); 8978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = _mm_unpacklo_epi8(linee, zero); 8998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_unpacklo_epi8(linef, zero); 9008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_unpacklo_epi8(lineg, zero); 9018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_unpacklo_epi8(lineh, zero); 9028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpacklo_epi16(linee, linef); 9048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_unpackhi_epi16(linee, linef); 9058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S 9068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int3 = _mm_unpacklo_epi16(lineg, lineh); 9078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_unpackhi_epi16(lineg, lineh); 9088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = _mm_unpacklo_epi16(int2, int3); 9108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpackhi_epi16(int2, int3); 9118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_unpacklo_epi16(linef, lineh); 9138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S high2 = _mm_unpackhi_epi16(linef, lineh); 9148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int4 = _mm_unpacklo_epi16(linea, linee); 9168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_unpackhi_epi16(linea, linee); 9178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int3 = _mm_unpacklo_epi16(int1, int2); 9198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_unpackhi_epi16(int1, int2); 9208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpacklo_epi16(linec, lineg); 9228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_unpackhi_epi16(linec, lineg); 9238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = int4; 9258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = int3; 9268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = int2; 9278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_unpacklo_epi16(high1, high2); 9298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_unpackhi_epi16(high1, high2); 
9308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //end of transpose 9328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs = (u4_bs >> 24) & 0xff; 9348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs1 = (u4_bs >> 16) & 0xff; 9358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u4_bs <<= 16; 9368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, 9388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs1, u1_Bs); 9398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s 9408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask 9418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs], 9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs], 9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs], 9458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]); 9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_subs_epi16(linec, lined); //Condn 1 9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_abs_epi16(diff); 9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const1 = _mm_set1_epi16(alpha); 9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag = _mm_cmpgt_epi16(const1, diff); 9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_subs_epi16(linee, lined); //Condtn 2 
9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_abs_epi16(diff); 9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const1 = _mm_set1_epi16(beta); 9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); 9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_subs_epi16(lineb, linec); //Condtn 3 9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_abs_epi16(diff); 9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on 9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions) 9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Adding Ap<Beta and Aq<Beta 9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_Ap = _mm_subs_epi16(linea, linec); 9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_Ap = _mm_abs_epi16(i_Ap); 9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const2 = _mm_cmpgt_epi16(const1, i_Ap); 9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0 9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C = _mm_add_epi16(i_C0, const2); 9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_Aq = _mm_subs_epi16(linef, lined); 9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_Aq = _mm_abs_epi16(i_Aq); 9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const2 = _mm_cmpgt_epi16(const1, i_Aq); 9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const2 = _mm_subs_epi16(zero, const2); 9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C = 
_mm_add_epi16(i_C, const2); 9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Calculate in_macro 9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_subs_epi16(lined, linec); 9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_slli_epi16(diff, 2); 9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const2 = _mm_subs_epi16(lineb, linee); 9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_add_epi16(diff, const2); 9818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const2 = _mm_set1_epi16(4); 9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S diff = _mm_add_epi16(diff, const2); 9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_srai_epi16(diff, 3); 9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3 9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C = _mm_subs_epi16(zero, i_C); 9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_max_epi16(i_C, in_macro); 9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Compute and store 9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_add_epi16(linec, in_macro); 9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_and_si128(in_macrotemp, flag); 9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp = _mm_and_si128(linec, 9938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF))); 9948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp = _mm_add_epi16(temp, in_macrotemp); 9958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //temp= _mm_packus_epi16 (temp, zero); 9968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp); 
9978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_subs_epi16(lined, in_macro); 9998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_and_si128(in_macrotemp, flag); 10008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_and_si128(lined, 10018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF))); 10028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, in_macrotemp); 10038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //temp1= _mm_packus_epi16 (temp1, zero); 10048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //_mm_storel_epi64(pu1_src+i, in_macrotemp); 10058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //If Ap<Beta 10078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_cmpgt_epi16(const1, i_Ap); 10088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_and_si128(flag, flag1); 10098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_add_epi16(linec, lined); 10108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1)); 10118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macrotemp = _mm_srai_epi16(in_macrotemp, 1); 10128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_add_epi16(in_macrotemp, linea); 10138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1)); 10148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_srai_epi16(in_macro, 1); 10158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3 10178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C0 = _mm_subs_epi16(zero, i_C0); 
10188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_max_epi16(i_C0, in_macro); 10198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_and_si128(in_macro, flag1); 10218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_add_epi16(lineb, in_macro); 10228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //in_macro= _mm_packus_epi16 (i_p1, zero); 10238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro); 10248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_cmpgt_epi16(const1, i_Aq); 10268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1 = _mm_and_si128(flag, flag1); 10278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_add_epi16(in_macrotemp, linef); 10288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1)); 10298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_srai_epi16(in_macro, 1); 10308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C0 = _mm_abs_epi16(i_C0); 10328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3 10338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i_C0 = _mm_subs_epi16(zero, i_C0); 10348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_max_epi16(i_C0, in_macro); 10358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro = _mm_and_si128(in_macro, flag1); 10378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = _mm_add_epi16(linee, in_macro); 10388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //in_macro= _mm_packus_epi16 (i_q1, zero); 10398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
//_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro); 10408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = temp; 10418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = temp1; 10428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //End of filtering 10438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int1 = _mm_unpacklo_epi16(linea, linee); 10458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = _mm_unpackhi_epi16(linea, linee); 10468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpacklo_epi16(linec, lineg); 10488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_unpackhi_epi16(linec, lineg); 10498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = _mm_unpacklo_epi16(int1, int2); 10518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int3 = _mm_unpackhi_epi16(int1, int2); 10528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = _mm_unpacklo_epi16(linee, lineg); 10548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_unpackhi_epi16(linee, lineg); 10558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int1 = _mm_unpacklo_epi16(lineb, linef); 10578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_unpackhi_epi16(lineb, linef); 10588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpacklo_epi16(lined, lineh); 10608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_unpackhi_epi16(lined, lineh); 10618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_unpacklo_epi16(int1, int2); 10638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S int4 = _mm_unpackhi_epi16(int1, int2); 10648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_unpacklo_epi16(linef, lineh); 10668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_unpackhi_epi16(linef, lineh); 10678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int1 = _mm_unpackhi_epi16(linea, lineb); 10698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = _mm_unpacklo_epi16(linea, lineb); 10708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int2 = _mm_unpacklo_epi16(int3, int4); 10728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S high1 = _mm_unpackhi_epi16(int3, int4); 10738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_unpacklo_epi16(linec, lined); 10758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_unpackhi_epi16(linec, lined); 10768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_unpacklo_epi16(lineg, lineh); 10788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_unpackhi_epi16(lineg, lineh); 10798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = int1; 10818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = high1; 10828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = int2; 10838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //End of inverse transpose 10848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Packs and stores 10868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linea = _mm_packus_epi16(linea, zero); 10878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), 
linea); 10888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineb = _mm_packus_epi16(lineb, zero); 10908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb); 10918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linec = _mm_packus_epi16(linec, zero); 10938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec); 10948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lined = _mm_packus_epi16(lined, zero); 10968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined); 10978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linee = _mm_packus_epi16(linee, zero); 10998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee); 11008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S linef = _mm_packus_epi16(linef, zero); 11028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef); 11038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineg = _mm_packus_epi16(lineg, zero); 11058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg); 11068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S lineh = _mm_packus_epi16(lineh, zero); 11088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh); 
11098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 11118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 11128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 11148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 11158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Name : ih264_deblk_luma_horz_bslt4_ssse3() */ 11168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 11178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : This function performs filtering of a luma block */ 11188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* horizontal edge when boundary strength is less than 4. */ 11198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 11208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Inputs : pu1_src - pointer to the src sample q0 */ 11218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* src_strd - source stride */ 11228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* alpha - alpha value for the boundary */ 11238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* beta - beta value for the boundary */ 11248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* u4_bs - packed Boundary strength array */ 11258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* pu1_cliptab - tc0_table */ 11268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 11278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Globals : None */ 11288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 11298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Processing : This operation is described in Sec. 
8.7.2.3 under the */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264.                                      */
/*                                                                           */
/*  Outputs       : Rows p1, p0, q0 and q1 (16 pixels each) are filtered     */
/*                  in place wherever the filter conditions hold.            */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    // Row pointers are obtained by stepping one stride at a time. Keeping the
    // row offsets in 16-bit variables would truncate them for large strides,
    // and shifting the stride is undefined when it is negative.
    UWORD8 *pu1_p0 = pu1_src - src_strd;
    UWORD8 *pu1_p1 = pu1_p0 - src_strd;
    UWORD8 *pu1_p2 = pu1_p1 - src_strd;
    UWORD8 *pu1_q1 = pu1_src + src_strd;
    UWORD8 *pu1_q2 = pu1_q1 + src_strd;

    // One bS byte per 4-pixel segment; the most significant byte of u4_bs
    // belongs to columns 0-3, the least significant one to columns 12-15.
    UWORD8 u1_bs0 = (u4_bs >> 24) & 0xff;
    UWORD8 u1_bs1 = (u4_bs >> 16) & 0xff;
    UWORD8 u1_bs2 = (u4_bs >> 8) & 0xff;
    UWORD8 u1_bs3 = (u4_bs >> 0) & 0xff;

    // Clipping value (tc0) of every segment, looked up by its bS
    UWORD8 u1_tc0_0 = pu1_cliptab[u1_bs0];
    UWORD8 u1_tc0_1 = pu1_cliptab[u1_bs1];
    UWORD8 u1_tc0_2 = pu1_cliptab[u1_bs2];
    UWORD8 u1_tc0_3 = pu1_cliptab[u1_bs3];

    __m128i zero = _mm_setzero_si128();
    __m128i alpha_8x16, beta_8x16;
    __m128i p2_16x8, p1_16x8, p0_16x8, q0_16x8, q1_16x8, q2_16x8;
    __m128i p2_lo_8x16, p1_lo_8x16, p0_lo_8x16, q0_lo_8x16, q1_lo_8x16, q2_lo_8x16;
    __m128i p2_hi_8x16, p1_hi_8x16, p0_hi_8x16, q0_hi_8x16, q1_hi_8x16, q2_hi_8x16;
    __m128i bs_mask_16x8, filter_mask_16x8, p1_mask_16x8, q1_mask_16x8;
    __m128i tc0_lo_8x16, tc0_hi_8x16, neg_tc0_lo_8x16, neg_tc0_hi_8x16;
    __m128i tc_lo_8x16, tc_hi_8x16, neg_tc_lo_8x16, neg_tc_hi_8x16;
    __m128i absdiff_16x8, lt_lo_8x16, lt_hi_8x16;
    __m128i ap_lo_8x16, ap_hi_8x16, aq_lo_8x16, aq_hi_8x16;
    __m128i avg_lo_8x16, avg_hi_8x16, delta_lo_8x16, delta_hi_8x16;
    __m128i filt_16x8;

    alpha_8x16 = _mm_set1_epi16(alpha);
    beta_8x16 = _mm_set1_epi16(beta);

    // All six rows are read before anything is written back
    p2_16x8 = _mm_loadu_si128((__m128i *)pu1_p2);
    p1_16x8 = _mm_loadu_si128((__m128i *)pu1_p1);
    p0_16x8 = _mm_loadu_si128((__m128i *)pu1_p0);
    q0_16x8 = _mm_loadu_si128((__m128i *)pu1_src);
    q1_16x8 = _mm_loadu_si128((__m128i *)pu1_q1);
    q2_16x8 = _mm_loadu_si128((__m128i *)pu1_q2);

    // Zero-extended 16-bit copies (low / high 8 pixels) for the arithmetic
    p2_lo_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p2_hi_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_lo_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p1_hi_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_lo_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    p0_hi_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_lo_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q0_hi_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_lo_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q1_hi_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_lo_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q2_hi_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);

    // 0xFF in every byte of a segment whose bS is non-zero
    bs_mask_16x8 = _mm_set_epi32(-(u1_bs3 != 0), -(u1_bs2 != 0),
                                 -(u1_bs1 != 0), -(u1_bs0 != 0));

    // tc0 replicated over the four pixels of each segment, as 16-bit lanes
    tc0_lo_8x16 = _mm_set_epi16(u1_tc0_1, u1_tc0_1, u1_tc0_1, u1_tc0_1,
                                u1_tc0_0, u1_tc0_0, u1_tc0_0, u1_tc0_0);
    tc0_hi_8x16 = _mm_set_epi16(u1_tc0_3, u1_tc0_3, u1_tc0_3, u1_tc0_3,
                                u1_tc0_2, u1_tc0_2, u1_tc0_2, u1_tc0_2);
    neg_tc0_lo_8x16 = _mm_sub_epi16(zero, tc0_lo_8x16);
    neg_tc0_hi_8x16 = _mm_sub_epi16(zero, tc0_hi_8x16);

    // Cond1 (ABS(p0 - q0) < alpha). One of the two saturating differences is
    // always zero, so OR-ing them yields the absolute difference.
    absdiff_16x8 = _mm_or_si128(_mm_subs_epu8(q0_16x8, p0_16x8),
                                _mm_subs_epu8(p0_16x8, q0_16x8));
    lt_lo_8x16 = _mm_cmpgt_epi16(alpha_8x16, _mm_unpacklo_epi8(absdiff_16x8, zero));
    lt_hi_8x16 = _mm_cmpgt_epi16(alpha_8x16, _mm_unpackhi_epi8(absdiff_16x8, zero));
    filter_mask_16x8 = _mm_and_si128(bs_mask_16x8,
                                     _mm_packs_epi16(lt_lo_8x16, lt_hi_8x16));

    // Cond2 (ABS(q1 - q0) < beta)
    absdiff_16x8 = _mm_or_si128(_mm_subs_epu8(q0_16x8, q1_16x8),
                                _mm_subs_epu8(q1_16x8, q0_16x8));
    lt_lo_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpacklo_epi8(absdiff_16x8, zero));
    lt_hi_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpackhi_epi8(absdiff_16x8, zero));
    filter_mask_16x8 = _mm_and_si128(filter_mask_16x8,
                                     _mm_packs_epi16(lt_lo_8x16, lt_hi_8x16));

    // Cond3 (ABS(p1 - p0) < beta)
    absdiff_16x8 = _mm_or_si128(_mm_subs_epu8(p0_16x8, p1_16x8),
                                _mm_subs_epu8(p1_16x8, p0_16x8));
    lt_lo_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpacklo_epi8(absdiff_16x8, zero));
    lt_hi_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpackhi_epi8(absdiff_16x8, zero));

    // A pixel is filtered only when
    // (bS != 0) && (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
    filter_mask_16x8 = _mm_and_si128(filter_mask_16x8,
                                     _mm_packs_epi16(lt_lo_8x16, lt_hi_8x16));

    // Ap = ABS(p2 - p0) < beta : enables the p1 update
    absdiff_16x8 = _mm_or_si128(_mm_subs_epu8(p0_16x8, p2_16x8),
                                _mm_subs_epu8(p2_16x8, p0_16x8));
    ap_lo_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpacklo_epi8(absdiff_16x8, zero));
    ap_hi_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpackhi_epi8(absdiff_16x8, zero));
    p1_mask_16x8 = _mm_and_si128(filter_mask_16x8,
                                 _mm_packs_epi16(ap_lo_8x16, ap_hi_8x16));

    // Aq = ABS(q2 - q0) < beta : enables the q1 update
    absdiff_16x8 = _mm_or_si128(_mm_subs_epu8(q0_16x8, q2_16x8),
                                _mm_subs_epu8(q2_16x8, q0_16x8));
    aq_lo_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpacklo_epi8(absdiff_16x8, zero));
    aq_hi_8x16 = _mm_cmpgt_epi16(beta_8x16, _mm_unpackhi_epi8(absdiff_16x8, zero));
    q1_mask_16x8 = _mm_and_si128(filter_mask_16x8,
                                 _mm_packs_epi16(aq_lo_8x16, aq_hi_8x16));

    // tc = tc0 + (Ap < beta) + (Aq < beta). The compare results are -1 where
    // true, so subtracting them adds one.
    tc_lo_8x16 = _mm_sub_epi16(_mm_sub_epi16(tc0_lo_8x16, ap_lo_8x16), aq_lo_8x16);
    tc_hi_8x16 = _mm_sub_epi16(_mm_sub_epi16(tc0_hi_8x16, ap_hi_8x16), aq_hi_8x16);
    neg_tc_lo_8x16 = _mm_sub_epi16(zero, tc_lo_8x16);
    neg_tc_hi_8x16 = _mm_sub_epi16(zero, tc_hi_8x16);

    // delta = CLIP3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
    delta_lo_8x16 = _mm_slli_epi16(_mm_sub_epi16(q0_lo_8x16, p0_lo_8x16), 2);
    delta_lo_8x16 = _mm_add_epi16(delta_lo_8x16,
                                  _mm_sub_epi16(p1_lo_8x16, q1_lo_8x16));
    delta_lo_8x16 = _mm_add_epi16(delta_lo_8x16, _mm_set1_epi16(4));
    delta_lo_8x16 = _mm_srai_epi16(delta_lo_8x16, 3);
    delta_lo_8x16 = _mm_max_epi16(neg_tc_lo_8x16,
                                  _mm_min_epi16(tc_lo_8x16, delta_lo_8x16));

    delta_hi_8x16 = _mm_slli_epi16(_mm_sub_epi16(q0_hi_8x16, p0_hi_8x16), 2);
    delta_hi_8x16 = _mm_add_epi16(delta_hi_8x16,
                                  _mm_sub_epi16(p1_hi_8x16, q1_hi_8x16));
    delta_hi_8x16 = _mm_add_epi16(delta_hi_8x16, _mm_set1_epi16(4));
    delta_hi_8x16 = _mm_srai_epi16(delta_hi_8x16, 3);
    delta_hi_8x16 = _mm_max_epi16(neg_tc_hi_8x16,
                                  _mm_min_epi16(tc_hi_8x16, delta_hi_8x16));

    // p0' = CLIP_U8(p0 + delta); unfiltered pixels keep their old value
    filt_16x8 = _mm_packus_epi16(_mm_add_epi16(p0_lo_8x16, delta_lo_8x16),
                                 _mm_add_epi16(p0_hi_8x16, delta_hi_8x16));
    filt_16x8 = _mm_or_si128(_mm_and_si128(filter_mask_16x8, filt_16x8),
                             _mm_andnot_si128(filter_mask_16x8, p0_16x8));
    _mm_storeu_si128((__m128i *)pu1_p0, filt_16x8);

    // q0' = CLIP_U8(q0 - delta)
    filt_16x8 = _mm_packus_epi16(_mm_sub_epi16(q0_lo_8x16, delta_lo_8x16),
                                 _mm_sub_epi16(q0_hi_8x16, delta_hi_8x16));
    filt_16x8 = _mm_or_si128(_mm_and_si128(filter_mask_16x8, filt_16x8),
                             _mm_andnot_si128(filter_mask_16x8, q0_16x8));
    _mm_storeu_si128((__m128i *)pu1_src, filt_16x8);

    // (p0 + q0 + 1) >> 1, shared by the p1 and q1 updates
    avg_lo_8x16 = _mm_avg_epu16(q0_lo_8x16, p0_lo_8x16);
    avg_hi_8x16 = _mm_avg_epu16(q0_hi_8x16, p0_hi_8x16);

    // if(Ap < Beta)
    // p1' = p1 + CLIP3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1)
    delta_lo_8x16 = _mm_sub_epi16(p2_lo_8x16, _mm_slli_epi16(p1_lo_8x16, 1));
    delta_lo_8x16 = _mm_srai_epi16(_mm_add_epi16(avg_lo_8x16, delta_lo_8x16), 1);
    delta_lo_8x16 = _mm_max_epi16(neg_tc0_lo_8x16,
                                  _mm_min_epi16(tc0_lo_8x16, delta_lo_8x16));

    delta_hi_8x16 = _mm_sub_epi16(p2_hi_8x16, _mm_slli_epi16(p1_hi_8x16, 1));
    delta_hi_8x16 = _mm_srai_epi16(_mm_add_epi16(avg_hi_8x16, delta_hi_8x16), 1);
    delta_hi_8x16 = _mm_max_epi16(neg_tc0_hi_8x16,
                                  _mm_min_epi16(tc0_hi_8x16, delta_hi_8x16));

    filt_16x8 = _mm_packus_epi16(_mm_add_epi16(p1_lo_8x16, delta_lo_8x16),
                                 _mm_add_epi16(p1_hi_8x16, delta_hi_8x16));
    filt_16x8 = _mm_or_si128(_mm_and_si128(p1_mask_16x8, filt_16x8),
                             _mm_andnot_si128(p1_mask_16x8, p1_16x8));
    _mm_storeu_si128((__m128i *)pu1_p1, filt_16x8);

    // if(Aq < Beta)
    // q1' = q1 + CLIP3(-tc0, tc0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1)
    delta_lo_8x16 = _mm_sub_epi16(q2_lo_8x16, _mm_slli_epi16(q1_lo_8x16, 1));
    delta_lo_8x16 = _mm_srai_epi16(_mm_add_epi16(avg_lo_8x16, delta_lo_8x16), 1);
    delta_lo_8x16 = _mm_max_epi16(neg_tc0_lo_8x16,
                                  _mm_min_epi16(tc0_lo_8x16, delta_lo_8x16));

    delta_hi_8x16 = _mm_sub_epi16(q2_hi_8x16, _mm_slli_epi16(q1_hi_8x16, 1));
    delta_hi_8x16 = _mm_srai_epi16(_mm_add_epi16(avg_hi_8x16, delta_hi_8x16), 1);
    delta_hi_8x16 = _mm_max_epi16(neg_tc0_hi_8x16,
                                  _mm_min_epi16(tc0_hi_8x16, delta_hi_8x16));

    filt_16x8 = _mm_packus_epi16(_mm_add_epi16(q1_lo_8x16, delta_lo_8x16),
                                 _mm_add_epi16(q1_hi_8x16, delta_hi_8x16));
    filt_16x8 = _mm_or_si128(_mm_and_si128(q1_mask_16x8, filt_16x8),
                             _mm_andnot_si128(q1_mask_16x8, q1_16x8));
    _mm_storeu_si128((__m128i *)pu1_q1, filt_16x8);
}
14118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 14138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3() */ 14158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : This function performs filtering of a luma block */ 14178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* vertical edge when boundary strength is set to 4. */ 14188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Inputs : pu1_src - pointer to the src sample q0 */ 14208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* src_strd - source stride */ 14218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* alpha - alpha value for the boundary */ 14228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* beta - beta value for the boundary */ 14238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Globals : None */ 14258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Processing : When the function is called twice, this operation is as */ 14278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* described in Sec. 8.7.2.3 under the title "Filtering */ 14288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* process for edges for bS equal to 4" in ITU T Rec H.264. 
*/ 14298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Outputs : None */ 14318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Returns : None */ 14338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Issues : None */ 14358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Revision History: */ 14378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 14398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 12 02 2015 Naveen Kumar P Initial version */ 14408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 14418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 14428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src, 14438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 14448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 alpha, 14458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 beta) 14468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 14478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero = _mm_setzero_si128(); 14488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8; 14498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8; 14508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16; 14518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16; 14528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i 
q0_16x8_1; 14538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_1; 14548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2; 14558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2; 14568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp1, temp2, temp3, temp4, temp5, temp6; 14578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i Alpha_8x16, Beta_8x16; 14588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8; 14598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i const_val2_16x8 = _mm_set1_epi16(2); 14608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i line1, line2, line3, line4, line5, line6, line7, line8; 14618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_set1_epi16(alpha); 14638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Beta_8x16 = _mm_set1_epi16(beta); 14648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd)); 14668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd)); 14678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd)); 14688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd)); 14698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd)); 14708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd)); 14718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd)); 
14728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd)); 14738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(line1, line2); 14758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(line3, line4); 14768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(line5, line6); 14778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(line7, line8); 14788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi16(temp1, temp2); 14808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_unpackhi_epi16(temp1, temp2); 14818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpacklo_epi16(temp3, temp4); 14828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_unpackhi_epi16(temp3, temp4); 14838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpacklo_epi32(line1, line3); 14858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpackhi_epi32(line1, line3); 14868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpacklo_epi32(line2, line4); 14878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpackhi_epi32(line2, line4); 14888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero); 14908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero); 14918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero); 14928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero); 
14938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero); 14948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero); 14958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero); 14968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero); 14978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond1 (ABS(p0 - q0) < alpha) 14998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); 15008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); 15018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 15028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 15048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 15058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 15078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 15088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_packs_epi16(temp2, temp1); 15108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond2 (ABS(q1 - q0) < beta) 15128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); 15138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); 15148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 15158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
15168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 15178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 15188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 15208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 15218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 15238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 15258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond3 (ABS(p1 - p0) < beta) 15278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); 15288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); 15298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 15308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 15328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 15338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 15358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 15368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 15388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // 
!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) 15408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 15418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p0 - q0) < ((alpha >> 2) + 2)) 15438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, q0_16x8); 15448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q0_16x8, p0_16x8); 15458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 15468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2); 15478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8); 15488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 15508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 15518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 15528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 15538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, temp1); 15558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 15568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p2 - p0) < beta) 15588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); 15598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); 15608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 
15618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 15638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 15648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 15658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 15668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_packs_epi16(temp2, temp1); 15688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8); 15698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(q2 - q0) < beta) 15718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); 15728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); 15738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 15748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 15768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi8(temp1, zero); 15778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 15788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 15798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag4_16x8 = _mm_packs_epi16(temp2, temp1); 15818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8); 15828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // First 8 pixels 
15848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero); 15858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero); 15868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero); 15878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero); 15888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero); 15898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero); 15908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero); 15918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero); 15928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_1 and q0_1 15948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(p0_8x16, q1_8x16); 15958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(p1_8x16, q0_8x16); 15968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp1, const_val2_16x8); 15978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp2, const_val2_16x8); 15988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p1_8x16, 1); 15998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q1_8x16, 1); 16008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp5, temp3); 16018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp6, temp4); 16028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_srai_epi16(temp1, 2); 16038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_srai_epi16(temp2, 2); 16048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
16058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1_2 and q1_2 16068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_add_epi16(temp6, p0_8x16); 16078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_add_epi16(temp5, q0_8x16); 16088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, p2_8x16); 16098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, q2_8x16); 16108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_srai_epi16(temp1, 2); 16118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_srai_epi16(temp2, 2); 16128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_2 and q0_2 16148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp3, p2_8x16); 16158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp4, q2_8x16); 16168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, q1_8x16); 16178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, p1_8x16); 16188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p0_8x16, q0_8x16); 16198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(temp3, 1); 16208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 16218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp3); 16228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4)); 16238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4)); 16248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_srai_epi16(temp1, 3); 16258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_srai_epi16(temp2, 3); 16268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
16278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2_2 and q2_2 16288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp6, const_val2_16x8); 16298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp5, const_val2_16x8); 16308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_slli_epi16(p2_8x16, 1); 16318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_slli_epi16(q2_8x16, 1); 16328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_add_epi16(p2_8x16, temp3); 16338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(q2_8x16, temp4); 16348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp5 = _mm_slli_epi16(p3_8x16, 1); 16358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp6 = _mm_slli_epi16(q3_8x16, 1); 16368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp3); 16378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp4); 16388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp5); 16398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp2, temp6); 16408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_srai_epi16(temp1, 3); 16418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_srai_epi16(temp2, 3); 16428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_1 and q0_1 16448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero); 16458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero); 16468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1_2 and q1_2 16488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero); 
16498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero); 16508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0_2 and q0_2 16528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero); 16538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero); 16548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2_2 and q2_2 16568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero); 16578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero); 16588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 and q0 16608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_and_si128(p0_16x8, 16618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 16628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8); 16638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1); 16648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_and_si128(q0_16x8, 16658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 16668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8); 16678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1); 16688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 and q0 16708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_and_si128(p0_16x8, 16718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
_mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 16728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8); 16738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2); 16748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_and_si128(q0_16x8, 16758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 16768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8); 16778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2); 16788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1 and q1 16808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_and_si128(p1_16x8, 16818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 16828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8); 16838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2); 16848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_and_si128(q1_16x8, 16858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 16868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8); 16878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2); 16888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p2 and q2 16908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_and_si128(p2_16x8, 16918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 16928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8_2 = 
_mm_and_si128(p2_16x8_2, flag3_16x8); 16938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2); 16948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_and_si128(q2_16x8, 16958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 16968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8); 16978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2); 16988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8); 17008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8); 17018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8); 17028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8); 17038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_8x16 = _mm_unpacklo_epi16(temp1, temp2); 17058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_8x16 = _mm_unpackhi_epi16(temp1, temp2); 17068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_8x16 = _mm_unpacklo_epi16(temp3, temp4); 17078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_8x16 = _mm_unpackhi_epi16(temp3, temp4); 17088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16); 17108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_srli_si128(line1, 8); 17118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16); 17128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_srli_si128(line3, 8); 17138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = 
_mm_unpacklo_epi32(p2_8x16, q3_8x16); 17148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_srli_si128(line5, 8); 17158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16); 17168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_srli_si128(line7, 8); 17178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1); 17198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2); 17208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3); 17218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4); 17228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5); 17238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6); 17248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7); 17258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8); 17268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 17288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 17308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */ 17328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Description : This 
function performs filtering of a luma block   */
/*                  vertical edge when boundary strength is less than 4.     */
/*                                                                           */
/*  Inputs        : pu1_src     - pointer to the src sample q0               */
/*                  src_strd    - source stride                              */
/*                  alpha       - alpha value for the boundary               */
/*                  beta        - beta value for the boundary                */
/*                  u4_bs       - four boundary strengths packed one per     */
/*                                byte, most significant byte first          */
/*                  pu1_cliptab - tc0 table, indexed by boundary strength    */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : When the function is called twice, this operation is as  */
/*                  described in Sec.
8.7.2.3 under the title "Filtering */ 17478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* process for edges for bS less than 4" in ITU T Rec H.264.*/ 17488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Outputs : None */ 17508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Returns : None */ 17528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Issues : None */ 17548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Revision History: */ 17568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 17588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 12 02 2015 Naveen Kumar P Initial version */ 17598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* */ 17608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 17618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src, 17628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 17638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 alpha, 17648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 beta, 17658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 u4_bs, 17668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const UWORD8 *pu1_cliptab) 17678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 17688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero = _mm_setzero_si128(); 17698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16; 17708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8; 17718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8; 17728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp1, temp2, temp3, temp4; 17738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8; 17748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i in_macro_16x8; 17758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i const_val4_8x16; 17768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; 17778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 clip0, clip1, clip2, clip3; 17788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i line1, line2, line3, line4, line5, line6, line7, line8; 17798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2; 17808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2; 17818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd)); 17838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd)); 17848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd)); 17858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd)); 17868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd)); 17878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd)); 17888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd)); 
17898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd)); 17908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(line1, line2); 17928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(line3, line4); 17938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(line5, line6); 17948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(line7, line8); 17958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi16(temp1, temp2); 17978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_unpackhi_epi16(temp1, temp2); 17988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpacklo_epi16(temp3, temp4); 17998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_unpackhi_epi16(temp3, temp4); 18008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(line1, line3); 18028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpackhi_epi32(line1, line3); 18038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi32(line2, line4); 18048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpackhi_epi32(line2, line4); 18058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p3_16x8 = _mm_unpacklo_epi64(temp1, zero); 18078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p2_16x8 = _mm_unpackhi_epi64(temp1, zero); 18088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q2_16x8 = _mm_unpacklo_epi64(temp4, zero); 18098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q3_16x8 = _mm_unpackhi_epi64(temp4, zero); 18108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 
= _mm_unpacklo_epi64(temp2, zero); 18118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8 = _mm_unpackhi_epi64(temp2, zero); 18128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8 = _mm_unpacklo_epi64(temp3, zero); 18138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_unpackhi_epi64(temp3, zero); 18148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs0 = (u4_bs >> 24) & 0xff; 18168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs1 = (u4_bs >> 16) & 0xff; 18178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs2 = (u4_bs >> 8) & 0xff; 18188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs3 = (u4_bs >> 0) & 0xff; 18198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S clip0 = pu1_cliptab[u1_Bs0]; 18208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S clip1 = pu1_cliptab[u1_Bs1]; 18218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S clip2 = pu1_cliptab[u1_Bs2]; 18228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S clip3 = pu1_cliptab[u1_Bs3]; 18238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Alpha_8x16 = _mm_set1_epi16(alpha); 18258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S Beta_8x16 = _mm_set1_epi16(beta); 18268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2, 18288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0); 18298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2, 18318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S clip1, clip1, clip0, clip0); 18328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
18338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero); 18348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask 18358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero); 18368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond1 (ABS(p0 - q0) < alpha) 18388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); 18398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); 18408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 18418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 18438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 18448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_packs_epi16(temp2, zero); 18468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b); 18478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond2 (ABS(q1 - q0) < beta) 18498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); 18508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); 18518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 18528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 18548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, 
temp2); 18558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, zero); 18578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 18588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //Cond3 (ABS(p1 - p0) < beta) 18608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); 18618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); 18628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 18638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 18658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 18668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, zero); 18688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) 18708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 18718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(p2 - p0) < beta) 18738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); 18748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); 18758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 18768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, 
zero); 18788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 18798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_packs_epi16(temp2, zero); 18818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 18828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epi16(zero, temp2); 18848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C_8x16 = _mm_add_epi16(C0_8x16, temp2); 18868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // (ABS(q2 - q0) < beta) 18888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); 18898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); 18908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi8(temp1, temp2); 18918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(temp1, zero); 18938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 18948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_packs_epi16(temp2, zero); 18968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8); 18978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epi16(zero, temp2); 18998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C_8x16 = _mm_add_epi16(C_8x16, temp2); 19018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
19028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S const_val4_8x16 = _mm_set1_epi16(4); 19038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero), 19048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_unpacklo_epi8(p0_16x8, zero)); 19058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 19068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_unpacklo_epi8(q1_16x8, zero)); 19078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_slli_epi16(temp1, 2); 19088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, temp2); 19098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(temp1, const_val4_8x16); 19108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_srai_epi16(temp1, 3); 19118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3 19138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C_8x16 = _mm_subs_epi16(zero, C_8x16); 19148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3 19158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p0 19178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8); 19188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_packus_epi16(temp1, zero); 19208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8); 19228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_2 = _mm_and_si128( 19238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8, 
_mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF))); 19248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2); 19268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // q0 19288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8); 19298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_packus_epi16(temp1, zero); 19318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8); 19338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_2 = _mm_and_si128( 19348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF))); 19358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2); 19378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //if(Ap < Beta) 19398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero), 19408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_unpacklo_epi8(p0_16x8, zero)); 19418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1); 19428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //temp2 = _mm_subs_epi16(zero,temp2); 19438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2); 19448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_add_epi16(temp1, temp2); 19458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_srai_epi16(temp2, 
1); 19468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3 19488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C0_8x16 = _mm_subs_epi16(zero, C0_8x16); 19498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3 19508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // p1 19528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8); 19538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_packus_epi16(temp1, zero); 19558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8); 19578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_and_si128(p1_16x8, 19588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF))); 19598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1); 19608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //if(Aq < Beta) 19628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero), 19638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_unpacklo_epi8(p0_16x8, zero)); 19648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1); 19658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //temp2 = _mm_slli_epi16 (temp2, 1); 19668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2); 19678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = 
_mm_add_epi16(temp1, temp2); 19688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_srai_epi16(temp2, 1); 19698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3 19718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S C0_8x16 = _mm_subs_epi16(zero, C0_8x16); 19728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3 19738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8); 19758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // q1 19778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_packus_epi16(temp1, zero); 19788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8); 19808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_and_si128(q1_16x8, 19818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF))); 19828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1); 19838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8); 19858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1); 19868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8); 19878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8); 19888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = 
_mm_unpacklo_epi16(temp1, temp2); 19908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpackhi_epi16(temp1, temp2); 19918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_unpacklo_epi16(temp3, temp4); 19928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpackhi_epi16(temp3, temp4); 19938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line1 = _mm_unpacklo_epi32(line7, line8); 19958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line2 = _mm_srli_si128(line1, 8); 19968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line3 = _mm_unpackhi_epi32(line7, line8); 19978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line4 = _mm_srli_si128(line3, 8); 19988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line5 = _mm_unpacklo_epi32(temp1, temp2); 19998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line6 = _mm_srli_si128(line5, 8); 20008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line7 = _mm_unpackhi_epi32(temp1, temp2); 20018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S line8 = _mm_srli_si128(line7, 8); 20028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 20038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1); 20048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2); 20058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3); 20068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4); 20078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5); 20088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6); 20098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
_mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7); 20108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8); 20118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 20128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2013