/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_deblk_atom_intr.c
*
* @brief
*  Contains function definitions for deblocking filters
*
* @author
*  Rishab
*
* @par List of Functions:
*   - ihevc_deblk_luma_vert_ssse3()
*   - ihevc_deblk_luma_horz_ssse3()
*   - ihevc_deblk_chroma_vert_ssse3()
*   - ihevc_deblk_chroma_horz_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_deblk_tables.h"
#include "ihevc_debug.h"

#include "ihevc_tables_x86_intr.h"

#include <immintrin.h>
/**
*******************************************************************************
*
* @brief
*       Decision process and filtering for the luma block vertical edge.
*
* @par Description:
*     The decision process for the luma block vertical edge is carried out and
*     an appropriate filter is applied. The boundary filter strength, bs,
*     should be greater than 0. The pcm flags and the transquant bypass flags
*     should be taken care of by the calling function.
*
* @param[in] pu1_src
*  Pointer to the src sample q(0,0)
*
* @param[in] src_strd
*  Source stride
*
* @param[in] bs
*  Boundary filter strength of q(0,0)
*
* @param[in] quant_param_p
*  quantization parameter of p block
*
* @param[in] quant_param_q
*  quantization parameter of q block
*
* @param[in] beta_offset_div2
*  beta offset (divided by 2) added to the beta table index
*
* @param[in] tc_offset_div2
*  tc offset (divided by 2) added to the tc table index
*
* @param[in] filter_flag_p
*  flag whether to filter the p block
*
* @param[in] filter_flag_q
*  flag whether to filter the q block
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
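/* Illustrative call (hypothetical pointer and QP values): filter the vertical
 * edge between two blocks with bs = 2, both sides enabled and zero offsets,
 * where pu1_edge points to the sample q(0,0):
 *
 *     ihevc_deblk_luma_vert_ssse3(pu1_edge, stride, 2, qp_p, qp_q, 0, 0, 1, 1);
 */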

void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;
    WORD32 d, dp, dq, d_sam0, d_sam3;

    WORD32 d3, d0, de_0, de_1, de_2, de_3;
    WORD32 de, dep, deq;
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;


    {
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;



        ASSERT((bs > 0) && (bs <= 3));
        ASSERT(filter_flag_p || filter_flag_q);

        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is an intra/inter edge      */
        /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 the adding factor is (0*2), for BS = 2 or 3 it is (1*2)             */
        /* the above desired functionality is achieved by doing (2*(bs>>1))              */
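        /* e.g. (hypothetical values) qp_luma = 30, bs = 2, tc_offset_div2 = 0 */
        /* gives tc_indx = CLIP3(30 + 2 + 0, 0, 53) = 32                       */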

        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        if(0 == tc)
        {
            return;
        }
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && (de_1 < ((5 * tc + 1) >> 1)))
            {
                d_sam3 = 1;
            }

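            //de = 2 selects the strong filter, de = 1 the weak filter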
            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {


        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));

        if(de == 2)
        {
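            //strong filter, normative equations for reference:
            //p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
            //p1' = (p2 + p1 + p0 + q0 + 2) >> 2
            //p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
            //(symmetrically for q), each clipped to +/-2*tc around the input pixel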
            __m128i temp_pq_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_pq_str1_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;
            LWORD64 mask, tc2;
            tc = tc << 1;
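            //strong filtering clips to +/-2*tc, hence the doubling of tc above;
            //the filter flags go into the sign bits of the two 32-bit halves of
            //'mask' and are spread into full-lane byte masks below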
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);

            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
            //arranged x x x x x x x x q31 q30 q11 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);

            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);

            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);

            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
            temp_str3_16x8b     = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);

            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
            temp_str3_16x8b     = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);

            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);

            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);

            //calculating Clipping MIN for all pixel values.
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
            //to get q33 q23 q13 q03, p33 p23 p13 p03
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);

            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq_str0_16x8b  = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);

            //getting p0 p1 together and p2 p3 together
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
            //getting q1 q0 together and q3 q2 together
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
            //getting p's of row0 row1 together and of row2 row3 together
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
            temp_str2_16x8b    = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
            //getting q's of row0 row1 together and of row2 row3 together
            temp_str0_16x8b    = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            //getting values for respective rows in 16 bit
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            //packing values to 8 bit
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
            //Clipping MAX
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
            //Clipping MIN
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
            //separating row 2 and row 3
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);

        }

        else
        {

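            //weak filter, normative equations for reference:
            //delta = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4, applied only when
            //|delta| < 10*tc and then clipped to [-tc, tc];
            //dp1 = clip3(-(tc>>1), tc>>1, (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1)
            //dq1 = clip3(-(tc>>1), tc>>1, (((q2 + q0 + 1) >> 1) - q1 - delta) >> 1)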
            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 mask1;
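            //enable flags packed into the sign bits of the 16-bit lanes:
            //q1 (bit 63), q0 (bit 47), p0 (bit 31), p1 (bit 15)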
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);

            consttc_8x16b = _mm_set1_epi32(tc);


            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);

            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);

            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //converting to 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //getting -tc store
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
            //getting -tc store
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //calc 10 *tc
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
            //const 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
            //getting the mask values
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
            //rearranging the mask values
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
            //packing  d3q d2q d1q d0q d3p d2p d1p d0p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
            //absolute delta
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //mask for |delta| < 10*tc
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);


            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b =  _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);

            //getting -tc>>1 store  16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);

            //getting  all respective q's and p's together
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
            tmp2_const_8x16b = _mm_setzero_si128();
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            //clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);

            //getting the mask ready
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
            //packing dq1 dq0 dp0 dp1
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);

            //masking of the delta values dep, deq, filter_p, filter_q
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
            //converting 8 bit to 16 bit
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
            //shuffle values loaded
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
            //arranging each row delta in different registers
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);

            //adding the respective delta
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
            //saturating to 8 bit
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
            //separating different rows
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
        }

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
    }
}

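/**
*******************************************************************************
*
* @brief
*       Decision process and filtering for the luma block horizontal edge.
*
* @par Description:
*     The decision process for the luma block horizontal edge is carried out
*     and an appropriate filter is applied. The boundary filter strength, bs,
*     should be greater than 0. The pcm flags and the transquant bypass flags
*     should be taken care of by the calling function. The parameters carry
*     the same meaning as in ihevc_deblk_luma_vert_ssse3().
*
*******************************************************************************
*/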
void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;

    WORD32 d0, d3, dp, dq, d;
    WORD32 de_0, de_1, de_2, de_3;
    WORD32 d_sam0, d_sam3;
    WORD32 de, dep, deq;

    __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
    __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;




    {
        __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
        __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;

        ASSERT((bs > 0));
        ASSERT(filter_flag_p || filter_flag_q);

        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is an intra/inter edge      */
        /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 the adding factor is (0*2), for BS = 2 or 3 it is (1*2)             */
        /* the above desired functionality is achieved by doing (2*(bs>>1))              */

        tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        if(0 == tc)
        {
            return;
        }
        src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
        src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
        src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
        src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
        tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
        src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
        tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));


        src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
        src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);

        src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
        src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);

        src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
        src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);

        src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
        src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
        //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && (de_1 < ((5 * tc + 1) >> 1)))
            {
                d_sam3 = 1;
            }

            //de = 2 selects the strong filter, de = 1 the weak filter
            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {

        if(2 == de)
        {

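            //strong filter: same normative equations as in the vertical
            //function, here applied across rows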
            __m128i temp_pq0_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;

            LWORD64 mask, tc2;
            tc = tc << 1;
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            temp_str0_16x8b   = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
            temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
            temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
            //for clipping calc
            src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
            //saving the unmodified data of q1 p1 q0 p0
            src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
            //Clipping MAX and MIN for q1 p1 q0 p0
            src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
            src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
            temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);

            //calculating Clipping MAX and MIN for p2 and q2 .
            tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
            tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);


            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
            //q'1 p'1 q'0 p'0
            temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
            //pack with the unmodified data of q2 and p2
            src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
            //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2  p'2
            temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
            src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
            src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
            //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
            src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);

            _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);


        }

        else
        {

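            //weak filter: same normative equations as in the vertical
            //function, here applied across rows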
            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
            __m128i coefdelta_0_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 maskp0, maskp1, maskq0, maskq1;
            maskp0 = (LWORD64)filter_flag_p;
            maskq0 = (LWORD64)filter_flag_q;
            maskp1 = (LWORD64)dep;
            maskq1 = (LWORD64)deq;
            consttc_8x16b = _mm_set1_epi32(tc);

            tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);

            //getting -tc store
            tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);

            //getting tc in 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);

            //const 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //calc 10 *tc
            tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta0 without normalisation and clipping
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);

            const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);

            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);

            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
            //getting -tc
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //packing  d03q d02q d01q d00q d03p d02p d01p d00p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
            //absolute delta
            tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);

            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);

            //(-tc)>>1 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //mask for |delta| < 10*tc
            tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp0_const_8x16b = _mm_setzero_si128();
            src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
            //getting  all respective q's and p's together
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
            src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
            tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
            src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));

            //   src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
            //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
            src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
            coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));

            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
            src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
            //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
            src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);

            //rearranging the mask values
            src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);

            src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
            src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);

            //combining mask delta1
            tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //combining mask delta0
            tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);


            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
            //separating p and q delta 0 and adding p0 and q0
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
            src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
            //separating p and q delta 1 and adding p1 and q1
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
            src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
            src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
            //packing p1 q1 and p0 q0 to 8 bit
            src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
            src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);

            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);

            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);


        }



    }

}

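/**
*******************************************************************************
*
* @brief
*       Filtering for the chroma block vertical edge.
*
* @par Description:
*     Filters the four-sample chroma vertical edge with interleaved U and V
*     samples. This function is assumed to be called only when the boundary
*     strength is 2; the pcm flags and the transquant bypass flags should be
*     taken care of by the calling function. qp_offset_u and qp_offset_v are
*     the chroma QP offsets for the U and V components; the remaining
*     parameters carry the same meaning as in ihevc_deblk_luma_vert_ssse3().
*
*******************************************************************************
*/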
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */
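    /* chroma QP derivation: for an index in [0, 57] the QP is read from     */
    /* gai4_ihevc_qp_table; above 57 the mapping is linear (index - 6), and  */
    /* a negative index is used as-is                                        */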
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        mask = 0xffff00000000ffffLL;

        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
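        //scalar equivalent of the chroma delta (per component):
        //delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3))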
        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed , row2 and row3 packed
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older pixel values
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //getting values for row1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }



}

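/**
*******************************************************************************
*
* @brief
*       Filtering for the chroma block horizontal edge.
*
* @par Description:
*     Filters the four-sample chroma horizontal edge with interleaved U and V
*     samples. This function is assumed to be called only when the boundary
*     strength is 2; the parameters carry the same meaning as in
*     ihevc_deblk_chroma_vert_ssse3().
*
*******************************************************************************
*/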
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);


        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16 bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16 bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);



        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }


}