18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  File Name         : ih264_deblk_luma_ssse3.c                             */
238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description       : Contains function definitions for deblocking         */
258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues / Problems : None                                                 */
348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History  :                                                      */
368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                      intrinsics                           */
408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* File Includes                                                             */
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* System include files */
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stdio.h>
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* User include files */
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_typedefs.h"
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_platform_macros.h"
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_deblk_edge_filters.h"
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ih264_macros.h"
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Definitions                                                      */
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description   : This function performs filtering of a luma block         */
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  vertical edge when the boundary strength is set to 4.    */
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Inputs        : pu1_src    - pointer to the src sample q0                */
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  src_strd   - source stride                               */
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  alpha      - alpha value for the boundary                */
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  beta       - beta value for the boundary                 */
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Globals       : None                                                     */
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  title "Filtering process for edges for bS equal to 4" in */
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  ITU-T Rec. H.264.                                        */
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Outputs       : None                                                     */
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Returns       : None                                                     */
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues        : None                                                     */
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History:                                                        */
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Initial version                      */
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                     WORD32 src_strd,
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                     WORD32 alpha,
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                     WORD32 beta)
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_1;
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_1;
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i Alpha_8x16, Beta_8x16;
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i const_val2_16x8 = _mm_set1_epi16(2);
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_set1_epi16(alpha);
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Beta_8x16 = _mm_set1_epi16(beta);
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(line1, line2);
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(line3, line4);
1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(line5, line6);
1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(line7, line8);
1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi16(temp1, temp2);
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_unpackhi_epi16(temp1, temp2);
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpacklo_epi16(temp3, temp4);
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_unpackhi_epi16(temp3, temp4);
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpackhi_epi32(line2, line4);
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(line1, line2);
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(line3, line4);
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(line5, line6);
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(line7, line8);
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi16(temp1, temp2);
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_unpackhi_epi16(temp1, temp2);
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpacklo_epi16(temp3, temp4);
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_unpackhi_epi16(temp3, temp4);
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi32(line1, line3);
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpackhi_epi32(line1, line3);
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi32(line2, line4);
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpackhi_epi32(line2, line4);
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond1 (ABS(p0 - q0) < alpha)
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond2 (ABS(q1 - q0) < beta)
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond3 (ABS(p1 - p0) < beta)
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Filter mask: (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p2 - p0) < beta)
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(q2 - q0) < beta)
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // First 8 pixels
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_1 and q0_1
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p1_8x16, 1);
2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q1_8x16, 1);
2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp5, temp3);
2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp6, temp4);
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1_2 and q1_2
2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp6, p0_8x16);
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp5, q0_8x16);
2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, p2_8x16);
2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, q2_8x16);
2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_2 and q0_2
2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp3, p2_8x16);
2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp4, q2_8x16);
2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, q1_8x16);
2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, p1_8x16);
2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(temp3, 1);
2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp3);
2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2_2 and q2_2
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p2_8x16, 1);
3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q2_8x16, 1);
3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p2_8x16, temp3);
3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_add_epi16(q2_8x16, temp4);
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_slli_epi16(p3_8x16, 1);
3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_slli_epi16(q3_8x16, 1);
3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp4);
3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp5);
3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp6);
3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Second 8 pixels and packing with first 8 pixels
3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_1 and q0_1
3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p1_8x16, 1);
3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q1_8x16, 1);
3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp5, temp3);
3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp6, temp4);
3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 2);
3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 2);
3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1_2 and q1_2
3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp6, p0_8x16);
3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp5, q0_8x16);
3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, p2_8x16);
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, q2_8x16);
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 2);
3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 2);
3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_2 and q0_2
3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp3, p2_8x16);
3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp4, q2_8x16);
3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, q1_8x16);
3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, p1_8x16);
3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(temp3, 1);
3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp3);
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 3);
3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 3);
3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2_2 and q2_2
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p2_8x16, 1);
3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q2_8x16, 1);
3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p2_8x16, temp3);
3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_add_epi16(q2_8x16, temp4);
3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_slli_epi16(p3_8x16, 1);
3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_slli_epi16(q3_8x16, 1);
3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp4);
3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp5);
3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp6);
3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 3);
3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 3);
3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0 and q0
3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_and_si128(p0_16x8,
3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_and_si128(q0_16x8,
3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0 and q0
3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_and_si128(p0_16x8,
3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_and_si128(q0_16x8,
3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1 and q1
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_and_si128(p1_16x8,
4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_and_si128(q1_16x8,
4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2 and q2
4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_and_si128(p2_16x8,
4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_and_si128(q2_16x8,
4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_srli_si128(line1, 8);
4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_srli_si128(line3, 8);
4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_srli_si128(line5, 8);
4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_srli_si128(line7, 8);
4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_srli_si128(line1, 8);
4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_srli_si128(line3, 8);
4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_srli_si128(line5, 8);
4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_srli_si128(line7, 8);
4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description   : This function performs filtering of a luma block         */
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  horizontal edge when the boundary strength is set to 4.  */
4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Inputs        : pu1_src    - pointer to the src sample q0                */
4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  src_strd   - source stride                               */
4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  alpha      - alpha value for the boundary                */
4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  beta       - beta value for the boundary                 */
4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Globals       : None                                                     */
4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  title "Filtering process for edges for bS equal to 4" in */
4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  ITU-T Rec. H.264.                                        */
4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Outputs       : None                                                     */
5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Returns       : None                                                     */
5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues        : None                                                     */
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History:                                                        */
5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Initial version                      */
5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                     WORD32 src_strd,
5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                     WORD32 alpha,
5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                     WORD32 beta)
5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_HorzPixel;
5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();
5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_1;
5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_1;
5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i Alpha_8x16, Beta_8x16;
5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i const_val2_16x8 = _mm_set1_epi16(2);
5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_HorzPixel = pu1_src - (src_strd << 2);
5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posQ1 = src_strd;
5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posQ2 = X2(src_strd);
5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posQ3 = X3(src_strd);
5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP0 = X3(src_strd);
5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP1 = X2(src_strd);
5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP2 = src_strd;
5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP3 = 0;
5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_set1_epi16(alpha);
5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Beta_8x16 = _mm_set1_epi16(beta);
5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond1 (ABS(p0 - q0) < alpha)
5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond2 (ABS(q1 - q0) < beta)
5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond3 (ABS(p1 - p0) < beta)
5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p2 - p0) < beta)
6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(q2 - q0) < beta)
6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // First 8 pixels
6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_1 and q0_1
6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p1_8x16, 1);
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q1_8x16, 1);
6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp5, temp3);
6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp6, temp4);
6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1_2 and q1_2
6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp6, p0_8x16);
6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp5, q0_8x16);
6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, p2_8x16);
6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, q2_8x16);
6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_2 and q0_2
6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp3, p2_8x16);
6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp4, q2_8x16);
6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, q1_8x16);
6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, p1_8x16);
6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(temp3, 1);
6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp3);
6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2_2 and q2_2
6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p2_8x16, 1);
6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q2_8x16, 1);
6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p2_8x16, temp3);
6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_add_epi16(q2_8x16, temp4);
6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_slli_epi16(p3_8x16, 1);
6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_slli_epi16(q3_8x16, 1);
6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp4);
6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp5);
6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp6);
6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Second 8 pixels and packing with first 8 pixels
7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_1 and q0_1
7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p1_8x16, 1);
7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q1_8x16, 1);
7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp5, temp3);
7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp6, temp4);
7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 2);
7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 2);
7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1_2 and q1_2
7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp6, p0_8x16);
7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp5, q0_8x16);
7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, p2_8x16);
7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, q2_8x16);
7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 2);
7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 2);
7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_2 and q0_2
7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp3, p2_8x16);
7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp4, q2_8x16);
7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, q1_8x16);
7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, p1_8x16);
7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(temp3, 1);
7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp3);
7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 3);
7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 3);
7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2_2 and q2_2
7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p2_8x16, 1);
7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q2_8x16, 1);
7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p2_8x16, temp3);
7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_add_epi16(q2_8x16, temp4);
7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_slli_epi16(p3_8x16, 1);
7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_slli_epi16(q3_8x16, 1);
7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp4);
7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp5);
7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp6);
7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_srai_epi16(temp1, 3);
7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_srai_epi16(temp2, 3);
7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0 and q0
7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_and_si128(p0_16x8,
7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_and_si128(q0_16x8,
7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0 and q0
7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_and_si128(p0_16x8,
7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_and_si128(q0_16x8,
7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1 and q1
7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_and_si128(p1_16x8,
7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_and_si128(q1_16x8,
7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2 and q2
7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_and_si128(p2_16x8,
8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_and_si128(q2_16x8,
8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description   : This function performs filtering of a luma block         */
8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  vertical edge when the boundary strength is less than 4. */
8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Inputs        : pu1_src       - pointer to the src sample q0             */
8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  src_strd      - source stride                            */
8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  alpha         - alpha value for the boundary             */
8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  beta          - beta value for the boundary              */
8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  u4_bs         - packed Boundary strength array           */
8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  pu1_cliptab   - tc0_table                                */
8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Globals       : None                                                     */
8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  title "Filtering process for edges with bS less than 4"  */
8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  in ITU-T Rec. H.264.                                     */
8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Outputs       : None                                                     */
8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Returns       : None                                                     */
8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues        : None                                                     */
8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History:                                                        */
8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Initial version                      */
8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       WORD32 src_strd,
8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       WORD32 alpha,
8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       WORD32 beta,
8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       UWORD32 u4_bs,
8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       const UWORD8 *pu1_cliptab)
8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 u1_Bs, u1_Bs1;
8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8597497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar    WORD32 j = 0;
8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i int1, int2, int3, int4, high1, high2;
8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i flag, flag1, i_C, i_C0;
8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    temp1;
8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();
8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //Transpose
8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
8738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
8748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));
8758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = _mm_unpacklo_epi8(linea, zero);
8778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_unpacklo_epi8(lineb, zero);
8788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = _mm_unpacklo_epi8(linec, zero);
8798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_unpacklo_epi8(lined, zero);
8808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int1 = _mm_unpacklo_epi16(linea, lineb);
8828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_unpackhi_epi16(linea, lineb);
8838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpacklo_epi16(linec, lined);
8858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_unpackhi_epi16(linec, lined);
8868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = _mm_unpacklo_epi16(int1, int2);
8888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int1 = _mm_unpackhi_epi16(int1, int2);
8898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = _mm_unpacklo_epi16(lineb, lined);
8918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        high1 = _mm_unpackhi_epi16(lineb, lined);
8928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
8948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
8958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
8968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));
8978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = _mm_unpacklo_epi8(linee, zero);
8998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_unpacklo_epi8(linef, zero);
9008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_unpacklo_epi8(lineg, zero);
9018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_unpacklo_epi8(lineh, zero);
9028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpacklo_epi16(linee, linef);
9048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_unpackhi_epi16(linee, linef);
9058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int3 = _mm_unpacklo_epi16(lineg, lineh);
9078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_unpackhi_epi16(lineg, lineh);
9088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = _mm_unpacklo_epi16(int2, int3);
9108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpackhi_epi16(int2, int3);
9118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_unpacklo_epi16(linef, lineh);
9138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        high2 = _mm_unpackhi_epi16(linef, lineh);
9148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int4 = _mm_unpacklo_epi16(linea, linee);
9168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_unpackhi_epi16(linea, linee);
9178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int3 = _mm_unpacklo_epi16(int1, int2);
9198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_unpackhi_epi16(int1, int2);
9208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpacklo_epi16(linec, lineg);
9228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_unpackhi_epi16(linec, lineg);
9238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = int4;
9258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = int3;
9268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = int2;
9278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_unpacklo_epi16(high1, high2);
9298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_unpackhi_epi16(high1, high2);
9308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //end of transpose
9328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        u1_Bs = (u4_bs >> 24) & 0xff;
9348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        u1_Bs1 = (u4_bs >> 16) & 0xff;
9358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        u4_bs <<= 16;
9368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
9388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                              u1_Bs1, u1_Bs);
9398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
9408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask
9418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
9458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_subs_epi16(linec, lined); //Condn 1
9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_abs_epi16(diff);
9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const1 = _mm_set1_epi16(alpha);
9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag = _mm_cmpgt_epi16(const1, diff);
9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_subs_epi16(linee, lined); //Condtn 2
9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_abs_epi16(diff);
9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const1 = _mm_set1_epi16(beta);
9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));
9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_subs_epi16(lineb, linec); //Condtn 3
9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_abs_epi16(diff);
9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on
9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)
9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //Adding Ap<Beta and Aq<Beta
9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_Ap = _mm_subs_epi16(linea, linec);
9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_Ap = _mm_abs_epi16(i_Ap);
9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const2 = _mm_cmpgt_epi16(const1, i_Ap);
9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C = _mm_add_epi16(i_C0, const2);
9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_Aq = _mm_subs_epi16(linef, lined);
9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_Aq = _mm_abs_epi16(i_Aq);
9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const2 = _mm_cmpgt_epi16(const1, i_Aq);
9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const2 = _mm_subs_epi16(zero, const2);
9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C = _mm_add_epi16(i_C, const2);
9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //Calculate in_macro
9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_subs_epi16(lined, linec);
9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_slli_epi16(diff, 2);
9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const2 = _mm_subs_epi16(lineb, linee);
9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_add_epi16(diff, const2);
9818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        const2 = _mm_set1_epi16(4);
9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        diff = _mm_add_epi16(diff, const2);
9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_srai_epi16(diff, 3);
9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C = _mm_subs_epi16(zero, i_C);
9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_max_epi16(i_C, in_macro);
9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //Compute and store
9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_add_epi16(linec, in_macro);
9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp = _mm_and_si128(linec,
9938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
9948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp = _mm_add_epi16(temp, in_macrotemp);
9958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //temp= _mm_packus_epi16 (temp, zero);
9968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);
9978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_subs_epi16(lined, in_macro);
9998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
10008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp1 = _mm_and_si128(lined,
10018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
10028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp1 = _mm_add_epi16(temp1, in_macrotemp);
10038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //temp1= _mm_packus_epi16 (temp1, zero);
10048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //_mm_storel_epi64(pu1_src+i, in_macrotemp);
10058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //If Ap<Beta
10078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_cmpgt_epi16(const1, i_Ap);
10088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_and_si128(flag, flag1);
10098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_add_epi16(linec, lined);
10108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
10118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
10128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_add_epi16(in_macrotemp, linea);
10138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
10148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_srai_epi16(in_macro, 1);
10158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
10178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C0 = _mm_subs_epi16(zero, i_C0);
10188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_max_epi16(i_C0, in_macro);
10198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_and_si128(in_macro, flag1);
10218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_add_epi16(lineb, in_macro);
10228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //in_macro= _mm_packus_epi16 (i_p1, zero);
10238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);
10248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_cmpgt_epi16(const1, i_Aq);
10268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        flag1 = _mm_and_si128(flag, flag1);
10278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_add_epi16(in_macrotemp, linef);
10288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
10298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_srai_epi16(in_macro, 1);
10308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C0 = _mm_abs_epi16(i_C0);
10328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
10338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        i_C0 = _mm_subs_epi16(zero, i_C0);
10348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_max_epi16(i_C0, in_macro);
10358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        in_macro = _mm_and_si128(in_macro, flag1);
10378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = _mm_add_epi16(linee, in_macro);
10388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //in_macro= _mm_packus_epi16 (i_q1, zero);
10398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
10408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = temp;
10418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = temp1;
10428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //End of filtering
10438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int1 = _mm_unpacklo_epi16(linea, linee);
10458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = _mm_unpackhi_epi16(linea, linee);
10468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpacklo_epi16(linec, lineg);
10488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_unpackhi_epi16(linec, lineg);
10498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = _mm_unpacklo_epi16(int1, int2);
10518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int3 = _mm_unpackhi_epi16(int1, int2);
10528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = _mm_unpacklo_epi16(linee, lineg);
10548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_unpackhi_epi16(linee, lineg);
10558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int1 = _mm_unpacklo_epi16(lineb, linef);
10578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_unpackhi_epi16(lineb, linef);
10588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpacklo_epi16(lined, lineh);
10608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_unpackhi_epi16(lined, lineh);
10618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_unpacklo_epi16(int1, int2);
10638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int4 = _mm_unpackhi_epi16(int1, int2);
10648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_unpacklo_epi16(linef, lineh);
10668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_unpackhi_epi16(linef, lineh);
10678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int1 = _mm_unpackhi_epi16(linea, lineb);
10698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = _mm_unpacklo_epi16(linea, lineb);
10708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        int2 = _mm_unpacklo_epi16(int3, int4);
10728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        high1 = _mm_unpackhi_epi16(int3, int4);
10738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_unpacklo_epi16(linec, lined);
10758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_unpackhi_epi16(linec, lined);
10768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_unpacklo_epi16(lineg, lineh);
10788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_unpackhi_epi16(lineg, lineh);
10798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = int1;
10818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = high1;
10828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = int2;
10838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //End of inverse transpose
10848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //Packs and stores
10868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linea = _mm_packus_epi16(linea, zero);
10878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);
10888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineb = _mm_packus_epi16(lineb, zero);
10908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);
10918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linec = _mm_packus_epi16(linec, zero);
10938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);
10948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lined = _mm_packus_epi16(lined, zero);
10968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);
10978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linee = _mm_packus_epi16(linee, zero);
10998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);
11008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        linef = _mm_packus_epi16(linef, zero);
11028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);
11038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineg = _mm_packus_epi16(lineg, zero);
11058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);
11068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        lineh = _mm_packus_epi16(lineh, zero);
11088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);
11098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
11118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
11128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
11148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                      */
11168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description   : This function performs filtering of a luma block         */
11188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  horizontal edge when boundary strength is less than 4.   */
11198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Inputs        : pu1_src       - pointer to the src sample q0             */
11218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  src_strd      - source stride                            */
11228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  alpha         - alpha value for the boundary             */
11238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  beta          - beta value for the boundary              */
11248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  u4_bs         - packed Boundary strength array           */
11258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  pu1_cliptab   - tc0_table                                */
11268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Globals       : None                                                     */
11288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
11308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  title "Filtering process for edges for bS less than 4"   */
11318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  in ITU-T Rec. H.264.                                     */
11328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Outputs       : None                                                     */
11348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Returns       : None                                                     */
11368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues        : None                                                     */
11388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History:                                                        */
11408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
11428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Initial version                      */
11438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
11448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
11458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
11468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       WORD32 src_strd,
11478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       WORD32 alpha,
11488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       WORD32 beta,
11498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       UWORD32 u4_bs,
11508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                       const UWORD8 *pu1_cliptab)
11518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
11528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
11538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_HorzPixel;
11548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();
11558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
11568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
11578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp1, temp2;
11588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
11598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i in_macro_16x8, in_macro_hi_16x8;
11608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i const_val4_8x16;
11618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
11628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 clip0, clip1, clip2, clip3;
11638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_HorzPixel = pu1_src - (src_strd << 2);
11658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posQ1 = src_strd;
11678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posQ2 = X2(src_strd);
11688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP0 = X3(src_strd);
11698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP1 = X2(src_strd);
11708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i16_posP2 = src_strd;
11718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
11738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
11748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs0 = (u4_bs >> 24) & 0xff;
11768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs1 = (u4_bs >> 16) & 0xff;
11778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs2 = (u4_bs >> 8) & 0xff;
11788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs3 = (u4_bs >> 0) & 0xff;
11798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip0 = pu1_cliptab[u1_Bs0];
11808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip1 = pu1_cliptab[u1_Bs1];
11818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip2 = pu1_cliptab[u1_Bs2];
11828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip3 = pu1_cliptab[u1_Bs3];
11838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_set1_epi16(alpha);
11858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Beta_8x16 = _mm_set1_epi16(beta);
11868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
11888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
11898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
11908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
11928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
11938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           clip0, clip0);
11948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
11968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
11978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
11988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
11998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
12018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
12028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
12038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
12048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond1 (ABS(p0 - q0) < alpha)
12068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
12078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
12088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
12098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
12118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
12128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
12148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
12158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
12178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
12188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond2 (ABS(q1 - q0) < beta)
12208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
12218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
12228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
12238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
12258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
12268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
12288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
12298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
12318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
12338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond3 (ABS(p1 - p0) < beta)
12358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
12368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
12378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
12388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
12408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
12418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
12438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
12448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
12468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
12488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
12498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p2 - p0) < beta)
12518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
12528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
12538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
12548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
12568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
12578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
12588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
12598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
12618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
12628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(zero, temp2);
12648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epi16(zero, temp1);
12658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
12678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
12688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(q2 - q0) < beta)
12708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
12718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
12728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
12738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
12758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
12768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
12778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
12788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
12808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
12818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(zero, temp2);
12838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epi16(zero, temp1);
12848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_8x16 = _mm_add_epi16(C_8x16, temp2);
12868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
12878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    const_val4_8x16 = _mm_set1_epi16(4);
12898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
12908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           _mm_unpacklo_epi8(p0_16x8, zero));
12918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
12928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           _mm_unpacklo_epi8(q1_16x8, zero));
12938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_slli_epi16(temp1, 2);
12948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp2);
12958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
12968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_srai_epi16(temp1, 3);
12978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
12998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           _mm_unpackhi_epi8(p0_16x8, zero));
13008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
13018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           _mm_unpackhi_epi8(q1_16x8, zero));
13028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_slli_epi16(temp1, 2);
13038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp2);
13048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
13058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
13068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
13088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
13098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_8x16 = _mm_subs_epi16(zero, C_8x16);
13108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
13118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
13128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
13138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
13158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
13168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, temp2);
13188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_and_si128(temp1, flag1_16x8);
13208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_and_si128(p0_16x8,
13218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
13228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
13248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
13268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
13288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
13298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, temp2);
13318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_and_si128(temp1, flag1_16x8);
13338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_and_si128(q0_16x8,
13348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
13358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
13378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_src), temp1);
13388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //if(Ap < Beta)
13408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
13418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_unpacklo_epi8(p0_16x8, zero));
13428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
13438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp2 = _mm_subs_epi16(zero,temp2);
13448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
13458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp1, temp2);
13468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
13478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
13498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_unpackhi_epi8(p0_16x8, zero));
13508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
13518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp2 = _mm_subs_epi16(zero,temp2);
13528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
13538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp1, temp2);
13548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
13558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
13578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
13588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
13598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
13608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
13618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
13628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
13648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
13658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, temp2);
13678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_and_si128(temp1, flag2_16x8);
13698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_and_si128(p1_16x8,
13708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
13718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
13728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
13738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //if(Aq < Beta)
13758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
13768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_unpacklo_epi8(p0_16x8, zero));
13778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
13788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp2 = _mm_slli_epi16 (temp2, 1);
13798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
13808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp1, temp2);
13818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
13828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
13848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_unpackhi_epi8(p0_16x8, zero));
13858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
13868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp2 = _mm_slli_epi16 (temp2, 1);
13878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
13888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp1, temp2);
13898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
13908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
13928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
13938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
13948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
13958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
13968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
13978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
13998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
14008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, temp2);
14028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_and_si128(temp1, flag3_16x8);
14048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_and_si128(q1_16x8,
14058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
14068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
14078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
14098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
14118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
14138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                  */
14158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description   : This function performs filtering of a luma block         */
14178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  vertical edge when boundary strength is set to 4.        */
14188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Inputs        : pu1_src       - pointer to the src sample q0             */
14208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  src_strd      - source stride                            */
14218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  alpha         - alpha value for the boundary             */
14228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  beta          - beta value for the boundary              */
14238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Globals       : None                                                     */
14258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Processing    : When the function is called twice, this operation is as  */
14278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
14288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  process for edges for bS equal to 4" in ITU-T Rec. H.264 */
14298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Outputs       : None                                                     */
14318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Returns       : None                                                     */
14338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues        : None                                                     */
14358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History:                                                        */
14378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
14398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Initial version                      */
14408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
14418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
14428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
14438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                           WORD32 src_strd,
14448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                           WORD32 alpha,
14458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                           WORD32 beta)
14468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
14478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();
14488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
14498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
14508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
14518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
14528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_1;
14538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_1;
14548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
14558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
14568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
14578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i Alpha_8x16, Beta_8x16;
14588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
14598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i const_val2_16x8 = _mm_set1_epi16(2);
14608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
14618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_set1_epi16(alpha);
14638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Beta_8x16 = _mm_set1_epi16(beta);
14648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
14668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
14678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
14688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
14698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
14708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
14718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
14728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
14738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(line1, line2);
14758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(line3, line4);
14768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(line5, line6);
14778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(line7, line8);
14788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi16(temp1, temp2);
14808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_unpackhi_epi16(temp1, temp2);
14818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpacklo_epi16(temp3, temp4);
14828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_unpackhi_epi16(temp3, temp4);
14838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
14858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
14868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
14878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpackhi_epi32(line2, line4);
14888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
14908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
14918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
14928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
14938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
14948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
14958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
14968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
14978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond1 (ABS(p0 - q0) < alpha)
14998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
15008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
15018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
15028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
15048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
15058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
15078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
15088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
15108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond2 (ABS(q1 - q0) < beta)
15128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
15138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
15148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
15158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
15178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
15188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
15208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
15218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
15238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
15258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond3 (ABS(p1 - p0) < beta)
15278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
15288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
15298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
15308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
15328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
15338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
15358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
15368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
15388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
15408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
15418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
15438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
15448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
15458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
15468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
15478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
15488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
15508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
15518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
15528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
15538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
15558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
15568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p2 - p0) < beta)
15588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
15598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
15608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
15618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
15638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
15648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
15658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
15668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
15688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
15698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(q2 - q0) < beta)
15718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
15728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
15738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
15748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
15768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi8(temp1, zero);
15778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
15788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
15798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
15818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
15828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // First 8 pixels
15848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
15858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
15868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
15878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
15888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
15898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
15908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
15918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
15928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_1 and q0_1
15948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
15958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
15968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
15978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
15988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p1_8x16, 1);
15998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q1_8x16, 1);
16008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp5, temp3);
16018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp6, temp4);
16028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
16038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
16048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1_2 and q1_2
16068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_add_epi16(temp6, p0_8x16);
16078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_add_epi16(temp5, q0_8x16);
16088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, p2_8x16);
16098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, q2_8x16);
16108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
16118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
16128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_2 and q0_2
16148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp3, p2_8x16);
16158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp4, q2_8x16);
16168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, q1_8x16);
16178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, p1_8x16);
16188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
16198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(temp3, 1);
16208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
16218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp3);
16228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
16238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
16248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
16258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
16268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2_2 and q2_2
16288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
16298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
16308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_slli_epi16(p2_8x16, 1);
16318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_slli_epi16(q2_8x16, 1);
16328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_add_epi16(p2_8x16, temp3);
16338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_add_epi16(q2_8x16, temp4);
16348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp5 = _mm_slli_epi16(p3_8x16, 1);
16358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp6 = _mm_slli_epi16(q3_8x16, 1);
16368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp3);
16378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp4);
16388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp5);
16398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp2, temp6);
16408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
16418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
16428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_1 and q0_1
16448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
16458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
16468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1_2 and q1_2
16488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
16498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
16508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0_2 and q0_2
16528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
16538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
16548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2_2 and q2_2
16568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
16578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
16588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0 and q0
16608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_and_si128(p0_16x8,
16618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
16628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
16638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
16648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_and_si128(q0_16x8,
16658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
16668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
16678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
16688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0 and q0
16708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_and_si128(p0_16x8,
16718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
16728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
16738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
16748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_and_si128(q0_16x8,
16758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
16768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
16778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
16788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1 and q1
16808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_and_si128(p1_16x8,
16818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
16828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
16838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
16848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_and_si128(q1_16x8,
16858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
16868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
16878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
16888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p2 and q2
16908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_and_si128(p2_16x8,
16918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
16928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
16938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
16948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_and_si128(q2_16x8,
16958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
16968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
16978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
16988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
17008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
17018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
17028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
17038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
17058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
17068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
17078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
17088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
17108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_srli_si128(line1, 8);
17118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
17128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_srli_si128(line3, 8);
17138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
17148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_srli_si128(line5, 8);
17158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
17168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_srli_si128(line7, 8);
17178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
17198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
17208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
17218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
17228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
17238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
17248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
17258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
17268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
17288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
17308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                */
17328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Description   : This function performs filtering of a luma block         */
17348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  vertical edge when boundary strength is less than 4.     */
17358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Inputs        : pu1_src       - pointer to the src sample q0             */
17378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  src_strd      - source stride                            */
17388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  alpha         - alpha value for the boundary             */
17398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  beta          - beta value for the boundary              */
17408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  u4_bs         - packed Boundary strength array           */
17418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  pu1_cliptab   - tc0_table                                */
17428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Globals       : None                                                     */
17448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Processing    : When the function is called twice, this operation is as  */
17468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
17478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                  process for edges with bS less than 4" in ITU-T H.264.   */
17488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Outputs       : None                                                     */
17508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Returns       : None                                                     */
17528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Issues        : None                                                     */
17548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*  Revision History:                                                        */
17568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
17588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*         12 02 2015   Naveen Kumar P  Initial version                      */
17598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*                                                                           */
17608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
17618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
17628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                             WORD32 src_strd,
17638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                             WORD32 alpha,
17648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                             WORD32 beta,
17658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                             UWORD32 u4_bs,
17668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                             const UWORD8 *pu1_cliptab)
17678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
17688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();
17698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
17708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
17718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
17728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp1, temp2, temp3, temp4;
17738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
17748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i in_macro_16x8;
17758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i const_val4_8x16;
17768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
17778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 clip0, clip1, clip2, clip3;
17788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
17798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
17808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
17818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
17838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
17848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
17858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
17868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
17878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
17888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
17898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
17908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(line1, line2);
17928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(line3, line4);
17938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(line5, line6);
17948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(line7, line8);
17958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi16(temp1, temp2);
17978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_unpackhi_epi16(temp1, temp2);
17988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpacklo_epi16(temp3, temp4);
17998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_unpackhi_epi16(temp3, temp4);
18008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi32(line1, line3);
18028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpackhi_epi32(line1, line3);
18038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi32(line2, line4);
18048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpackhi_epi32(line2, line4);
18058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
18078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
18088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
18098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
18108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
18118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
18128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
18138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
18148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs0 = (u4_bs >> 24) & 0xff;
18168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs1 = (u4_bs >> 16) & 0xff;
18178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs2 = (u4_bs >> 8) & 0xff;
18188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    u1_Bs3 = (u4_bs >> 0) & 0xff;
18198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip0 = pu1_cliptab[u1_Bs0];
18208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip1 = pu1_cliptab[u1_Bs1];
18218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip2 = pu1_cliptab[u1_Bs2];
18228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    clip3 = pu1_cliptab[u1_Bs3];
18238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Alpha_8x16 = _mm_set1_epi16(alpha);
18258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    Beta_8x16 = _mm_set1_epi16(beta);
18268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
18288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
18298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
18318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           clip1, clip1, clip0, clip0);
18328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
18348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
18358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
18368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond1 (ABS(p0 - q0) < alpha)
18388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
18398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
18408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
18418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
18438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
18448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_packs_epi16(temp2, zero);
18468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
18478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond2 (ABS(q1 - q0) < beta)
18498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
18508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
18518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
18528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
18548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
18558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, zero);
18578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
18588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //Cond3 (ABS(p1 - p0) < beta)
18608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
18618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
18628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
18638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
18658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
18668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, zero);
18688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
18708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
18718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(p2 - p0) < beta)
18738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
18748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
18758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
18768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
18788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
18798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_packs_epi16(temp2, zero);
18818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
18828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(zero, temp2);
18848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
18868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // (ABS(q2 - q0) < beta)
18888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
18898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
18908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi8(temp1, temp2);
18918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(temp1, zero);
18938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
18948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_packs_epi16(temp2, zero);
18968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
18978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(zero, temp2);
18998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_8x16 = _mm_add_epi16(C_8x16, temp2);
19018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    const_val4_8x16 = _mm_set1_epi16(4);
19038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
19048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           _mm_unpacklo_epi8(p0_16x8, zero));
19058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
19068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           _mm_unpacklo_epi8(q1_16x8, zero));
19078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_slli_epi16(temp1, 2);
19088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, temp2);
19098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
19108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_srai_epi16(temp1, 3);
19118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
19138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C_8x16 = _mm_subs_epi16(zero, C_8x16);
19148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
19158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p0
19178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
19188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, zero);
19208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
19228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_2 = _mm_and_si128(
19238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
19248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
19268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // q0
19288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
19298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, zero);
19318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
19338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_2 = _mm_and_si128(
19348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
19358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
19378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //if(Ap < Beta)
19398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
19408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_unpacklo_epi8(p0_16x8, zero));
19418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
19428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp2 = _mm_subs_epi16(zero,temp2);
19438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
19448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp1, temp2);
19458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
19468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
19488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
19498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
19508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // p1
19528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
19538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, zero);
19558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
19578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_and_si128(p1_16x8,
19588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
19598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
19608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //if(Aq < Beta)
19628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
19638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                          _mm_unpacklo_epi8(p0_16x8, zero));
19648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
19658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    //temp2 = _mm_slli_epi16 (temp2, 1);
19668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
19678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_add_epi16(temp1, temp2);
19688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
19698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
19718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
19728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
19738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
19758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // q1
19778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_packus_epi16(temp1, zero);
19788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
19808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_and_si128(q1_16x8,
19818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                            _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
19828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
19838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
19858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
19868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
19878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
19888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_unpacklo_epi16(temp1, temp2);
19908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp1 = _mm_unpackhi_epi16(temp1, temp2);
19918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_unpacklo_epi16(temp3, temp4);
19928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    temp2 = _mm_unpackhi_epi16(temp3, temp4);
19938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line1 = _mm_unpacklo_epi32(line7, line8);
19958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line2 = _mm_srli_si128(line1, 8);
19968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line3 = _mm_unpackhi_epi32(line7, line8);
19978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line4 = _mm_srli_si128(line3, 8);
19988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line5 = _mm_unpacklo_epi32(temp1, temp2);
19998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line6 = _mm_srli_si128(line5, 8);
20008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line7 = _mm_unpackhi_epi32(temp1, temp2);
20018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    line8 = _mm_srli_si128(line7, 8);
20028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
20038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
20048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
20058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
20068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
20078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
20088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
20098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
20108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
20118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
20128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2013