1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20/*****************************************************************************/
21/*                                                                           */
22/*  File Name         : ih264_deblk_luma_ssse3.c                             */
23/*                                                                           */
24/*  Description       : Contains function definitions for deblocking         */
25/*                                                                           */
26/*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
27/*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
28/*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
29/*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
30/*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
31/*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
32/*                                                                           */
33/*  Issues / Problems : None                                                 */
34/*                                                                           */
35/*  Revision History  :                                                      */
36/*                                                                           */
37/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38/*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
39/*                                      intrinsics                           */
40/*                                                                           */
41/*****************************************************************************/
42
43/*****************************************************************************/
44/* File Includes                                                             */
45/*****************************************************************************/
46
47/* System include files */
48#include <stdio.h>
49
50/* User include files */
51#include "ih264_typedefs.h"
52#include "ih264_platform_macros.h"
53#include "ih264_deblk_edge_filters.h"
54#include "ih264_macros.h"
55
56/*****************************************************************************/
57/* Function Definitions                                                      */
58/*****************************************************************************/
59
/*****************************************************************************/
/*  Private helpers for ih264_deblk_luma_vert_bs4_ssse3()                    */
/*                                                                           */
/*  Register naming: _16x8 = sixteen 8-bit lanes, _8x16 = eight 16-bit lanes */
/*****************************************************************************/

/* Returns a per-byte mask: 0xFF where |a - b| < limit, 0x00 elsewhere.      */
/* a and b hold 16 unsigned bytes; limit is replicated in 16-bit lanes.      */
static inline __m128i ih264_luma_vert_bs4_absdiff_lt(__m128i a_16x8,
                                                     __m128i b_16x8,
                                                     __m128i limit_8x16)
{
    const __m128i zero = _mm_setzero_si128();
    /* One of the two saturating differences is always 0, so the sum is the  */
    /* absolute difference.                                                  */
    __m128i absdiff_16x8 = _mm_add_epi8(_mm_subs_epu8(a_16x8, b_16x8),
                                        _mm_subs_epu8(b_16x8, a_16x8));
    /* The compare is signed, so it is done on bytes widened to 16 bits.     */
    __m128i lt_lo_8x16 = _mm_cmpgt_epi16(limit_8x16,
                                         _mm_unpacklo_epi8(absdiff_16x8, zero));
    __m128i lt_hi_8x16 = _mm_cmpgt_epi16(limit_8x16,
                                         _mm_unpackhi_epi8(absdiff_16x8, zero));

    /* Lanes are 0 or -1; signed saturation packs them to 0x00 / 0xFF.       */
    return _mm_packs_epi16(lt_lo_8x16, lt_hi_8x16);
}

/* Per-byte select: takes when_set where mask is 0xFF and when_clear where   */
/* mask is 0x00.                                                             */
static inline __m128i ih264_luma_vert_bs4_select(__m128i mask_16x8,
                                                 __m128i when_set_16x8,
                                                 __m128i when_clear_16x8)
{
    return _mm_or_si128(_mm_and_si128(mask_16x8, when_set_16x8),
                        _mm_andnot_si128(mask_16x8, when_clear_16x8));
}

/* Computes the bS=4 candidate values for one side of the edge, for eight    */
/* pixels held in 16-bit lanes. a0..a3 are the samples of the side being     */
/* filtered (a0 nearest the edge); b0, b1 are the two nearest samples of the */
/* other side. The sums stay far below the 16-bit range.                     */
/*   a0_weak   = (2*a1 + a0 + b1 + 2) >> 2                                   */
/*   a0_strong = (a2 + 2*a1 + 2*a0 + 2*b0 + b1 + 4) >> 3                     */
/*   a1_strong = (a2 + a1 + a0 + b0 + 2) >> 2                                */
/*   a2_strong = (2*a3 + 3*a2 + a1 + a0 + b0 + 4) >> 3                       */
static inline void ih264_luma_vert_bs4_side_8px(__m128i a3_8x16,
                                                __m128i a2_8x16,
                                                __m128i a1_8x16,
                                                __m128i a0_8x16,
                                                __m128i b0_8x16,
                                                __m128i b1_8x16,
                                                __m128i *a0_weak_8x16,
                                                __m128i *a0_strong_8x16,
                                                __m128i *a1_strong_8x16,
                                                __m128i *a2_strong_8x16)
{
    const __m128i two = _mm_set1_epi16(2);
    const __m128i four = _mm_set1_epi16(4);
    __m128i a1_x2 = _mm_slli_epi16(a1_8x16, 1);
    __m128i a0_plus_b0 = _mm_add_epi16(a0_8x16, b0_8x16);
    /* a1 + a0 + b0 is shared by the a1 and a2 outputs                       */
    __m128i near_sum = _mm_add_epi16(a1_8x16, a0_plus_b0);
    __m128i acc;

    acc = _mm_add_epi16(_mm_add_epi16(a1_x2, a0_8x16),
                        _mm_add_epi16(b1_8x16, two));
    *a0_weak_8x16 = _mm_srai_epi16(acc, 2);

    acc = _mm_add_epi16(_mm_add_epi16(a2_8x16, a1_x2),
                        _mm_slli_epi16(a0_plus_b0, 1));
    acc = _mm_add_epi16(acc, _mm_add_epi16(b1_8x16, four));
    *a0_strong_8x16 = _mm_srai_epi16(acc, 3);

    acc = _mm_add_epi16(_mm_add_epi16(a2_8x16, near_sum), two);
    *a1_strong_8x16 = _mm_srai_epi16(acc, 2);

    acc = _mm_add_epi16(_mm_slli_epi16(a3_8x16, 1),
                        _mm_add_epi16(a2_8x16, _mm_slli_epi16(a2_8x16, 1)));
    acc = _mm_add_epi16(acc, _mm_add_epi16(near_sum, four));
    *a2_strong_8x16 = _mm_srai_epi16(acc, 3);
}

/* 16-pixel wrapper around ih264_luma_vert_bs4_side_8px(): widens the low    */
/* and high eight bytes to 16 bits, filters each half and packs the results  */
/* back to bytes.                                                            */
static inline void ih264_luma_vert_bs4_side_16px(__m128i a3_16x8,
                                                 __m128i a2_16x8,
                                                 __m128i a1_16x8,
                                                 __m128i a0_16x8,
                                                 __m128i b0_16x8,
                                                 __m128i b1_16x8,
                                                 __m128i *a0_weak_16x8,
                                                 __m128i *a0_strong_16x8,
                                                 __m128i *a1_strong_16x8,
                                                 __m128i *a2_strong_16x8)
{
    const __m128i zero = _mm_setzero_si128();
    __m128i weak_lo, strong0_lo, strong1_lo, strong2_lo;
    __m128i weak_hi, strong0_hi, strong1_hi, strong2_hi;

    ih264_luma_vert_bs4_side_8px(_mm_unpacklo_epi8(a3_16x8, zero),
                                 _mm_unpacklo_epi8(a2_16x8, zero),
                                 _mm_unpacklo_epi8(a1_16x8, zero),
                                 _mm_unpacklo_epi8(a0_16x8, zero),
                                 _mm_unpacklo_epi8(b0_16x8, zero),
                                 _mm_unpacklo_epi8(b1_16x8, zero),
                                 &weak_lo, &strong0_lo,
                                 &strong1_lo, &strong2_lo);

    ih264_luma_vert_bs4_side_8px(_mm_unpackhi_epi8(a3_16x8, zero),
                                 _mm_unpackhi_epi8(a2_16x8, zero),
                                 _mm_unpackhi_epi8(a1_16x8, zero),
                                 _mm_unpackhi_epi8(a0_16x8, zero),
                                 _mm_unpackhi_epi8(b0_16x8, zero),
                                 _mm_unpackhi_epi8(b1_16x8, zero),
                                 &weak_hi, &strong0_hi,
                                 &strong1_hi, &strong2_hi);

    *a0_weak_16x8 = _mm_packus_epi16(weak_lo, weak_hi);
    *a0_strong_16x8 = _mm_packus_epi16(strong0_lo, strong0_hi);
    *a1_strong_16x8 = _mm_packus_epi16(strong1_lo, strong1_hi);
    *a2_strong_16x8 = _mm_packus_epi16(strong2_lo, strong2_hi);
}

/* Loads eight rows of eight bytes (p3 p2 p1 p0 q0 q1 q2 q3) starting at     */
/* pu1_rows and transposes them. Each output register holds two columns:     */
/* the first named column for rows 0..7 in its low 64 bits and the second    */
/* named column for rows 0..7 in its high 64 bits.                           */
static inline void ih264_luma_vert_bs4_load_8rows(const UWORD8 *pu1_rows,
                                                  WORD32 src_strd,
                                                  __m128i *p3p2,
                                                  __m128i *p1p0,
                                                  __m128i *q0q1,
                                                  __m128i *q2q3)
{
    __m128i row0 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 0 * src_strd));
    __m128i row1 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 1 * src_strd));
    __m128i row2 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 2 * src_strd));
    __m128i row3 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 3 * src_strd));
    __m128i row4 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 4 * src_strd));
    __m128i row5 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 5 * src_strd));
    __m128i row6 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 6 * src_strd));
    __m128i row7 = _mm_loadl_epi64((const __m128i *)(pu1_rows + 7 * src_strd));

    /* bytes of two rows interleaved */
    __m128i rows01 = _mm_unpacklo_epi8(row0, row1);
    __m128i rows23 = _mm_unpacklo_epi8(row2, row3);
    __m128i rows45 = _mm_unpacklo_epi8(row4, row5);
    __m128i rows67 = _mm_unpacklo_epi8(row6, row7);

    /* four rows of each column; p columns in the low half, q in the high */
    __m128i p_rows0123 = _mm_unpacklo_epi16(rows01, rows23);
    __m128i q_rows0123 = _mm_unpackhi_epi16(rows01, rows23);
    __m128i p_rows4567 = _mm_unpacklo_epi16(rows45, rows67);
    __m128i q_rows4567 = _mm_unpackhi_epi16(rows45, rows67);

    *p3p2 = _mm_unpacklo_epi32(p_rows0123, p_rows4567);
    *p1p0 = _mm_unpackhi_epi32(p_rows0123, p_rows4567);
    *q0q1 = _mm_unpacklo_epi32(q_rows0123, q_rows4567);
    *q2q3 = _mm_unpackhi_epi32(q_rows0123, q_rows4567);
}

/* Transposes back and stores eight rows of eight bytes starting at          */
/* pu1_rows. Each input holds two columns byte-interleaved for the eight     */
/* rows to be written (e.g. p3p2 = p3[r0] p2[r0] p3[r1] p2[r1] ...).         */
static inline void ih264_luma_vert_bs4_store_8rows(UWORD8 *pu1_rows,
                                                   WORD32 src_strd,
                                                   __m128i p3p2,
                                                   __m128i p1p0,
                                                   __m128i q0q1,
                                                   __m128i q2q3)
{
    /* four bytes (p3 p2 p1 p0 or q0 q1 q2 q3) per row, four rows each */
    __m128i p_rows0123 = _mm_unpacklo_epi16(p3p2, p1p0);
    __m128i p_rows4567 = _mm_unpackhi_epi16(p3p2, p1p0);
    __m128i q_rows0123 = _mm_unpacklo_epi16(q0q1, q2q3);
    __m128i q_rows4567 = _mm_unpackhi_epi16(q0q1, q2q3);

    /* two complete 8-byte rows per register */
    __m128i rows01 = _mm_unpacklo_epi32(p_rows0123, q_rows0123);
    __m128i rows23 = _mm_unpackhi_epi32(p_rows0123, q_rows0123);
    __m128i rows45 = _mm_unpacklo_epi32(p_rows4567, q_rows4567);
    __m128i rows67 = _mm_unpackhi_epi32(p_rows4567, q_rows4567);

    _mm_storel_epi64((__m128i *)(pu1_rows + 0 * src_strd), rows01);
    _mm_storel_epi64((__m128i *)(pu1_rows + 1 * src_strd),
                     _mm_srli_si128(rows01, 8));
    _mm_storel_epi64((__m128i *)(pu1_rows + 2 * src_strd), rows23);
    _mm_storel_epi64((__m128i *)(pu1_rows + 3 * src_strd),
                     _mm_srli_si128(rows23, 8));
    _mm_storel_epi64((__m128i *)(pu1_rows + 4 * src_strd), rows45);
    _mm_storel_epi64((__m128i *)(pu1_rows + 5 * src_strd),
                     _mm_srli_si128(rows45, 8));
    _mm_storel_epi64((__m128i *)(pu1_rows + 6 * src_strd), rows67);
    _mm_storel_epi64((__m128i *)(pu1_rows + 7 * src_strd),
                     _mm_srli_si128(rows67, 8));
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  vertical edge when the boundary strength is set to 4.    */
/*                  16 rows are processed; in each row the 8 bytes           */
/*                  pu1_src[-4] .. pu1_src[3] (p3 p2 p1 p0 q0 q1 q2 q3)      */
/*                  are read and written back.                               */
/*                                                                           */
/*  Inputs        : pu1_src    - pointer to the src sample q0                */
/*                  src_strd   - source stride                               */
/*                  alpha      - alpha value for the boundary                */
/*                  beta       - beta value for the boundary                 */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                  A row is filtered only when                              */
/*                    ABS(p0 - q0) < alpha && ABS(q1 - q0) < beta &&         */
/*                    ABS(p1 - p0) < beta.                                   */
/*                  Within such a row, a side gets the strong filter         */
/*                  (p0..p2 or q0..q2 modified) when additionally            */
/*                    ABS(p0 - q0) < ((alpha >> 2) + 2) and                  */
/*                    ABS(p2 - p0) < beta (resp. ABS(q2 - q0) < beta);       */
/*                  otherwise only p0 (resp. q0) is modified.                */
/*                                                                           */
/*  Outputs       : Filtered samples are written in place through pu1_src.   */
/*                  The p3 and q3 columns are written back unchanged.        */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    UWORD8 *pu1_rows_0_7 = pu1_src - 4;
    UWORD8 *pu1_rows_8_15 = pu1_src - 4 + 8 * src_strd;

    const __m128i alpha_8x16 = _mm_set1_epi16((short)alpha);
    const __m128i beta_8x16 = _mm_set1_epi16((short)beta);
    /* (alpha >> 2) + 2 : threshold that enables the strong filter */
    const __m128i alpha_strong_8x16 =
                    _mm_add_epi16(_mm_srai_epi16(alpha_8x16, 2),
                                  _mm_set1_epi16(2));

    __m128i top_p3p2, top_p1p0, top_q0q1, top_q2q3;
    __m128i bot_p3p2, bot_p1p0, bot_q0q1, bot_q2q3;
    __m128i p3_16x8, p2_16x8, p1_16x8, p0_16x8;
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i filter_mask, strong_mask, p_strong_mask, q_strong_mask;
    __m128i p0_weak, p0_strong, p1_strong, p2_strong;
    __m128i q0_weak, q0_strong, q1_strong, q2_strong;
    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;

    /* Load all 16 rows and transpose so that every register holds one      */
    /* pixel column (lane i = row i).                                        */
    ih264_luma_vert_bs4_load_8rows(pu1_rows_0_7, src_strd,
                                   &top_p3p2, &top_p1p0,
                                   &top_q0q1, &top_q2q3);
    ih264_luma_vert_bs4_load_8rows(pu1_rows_8_15, src_strd,
                                   &bot_p3p2, &bot_p1p0,
                                   &bot_q0q1, &bot_q2q3);

    p3_16x8 = _mm_unpacklo_epi64(top_p3p2, bot_p3p2);
    p2_16x8 = _mm_unpackhi_epi64(top_p3p2, bot_p3p2);
    p1_16x8 = _mm_unpacklo_epi64(top_p1p0, bot_p1p0);
    p0_16x8 = _mm_unpackhi_epi64(top_p1p0, bot_p1p0);
    q0_16x8 = _mm_unpacklo_epi64(top_q0q1, bot_q0q1);
    q1_16x8 = _mm_unpackhi_epi64(top_q0q1, bot_q0q1);
    q2_16x8 = _mm_unpacklo_epi64(top_q2q3, bot_q2q3);
    q3_16x8 = _mm_unpackhi_epi64(top_q2q3, bot_q2q3);

    /* (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) &&                    */
    /* (ABS(p1 - p0) < beta)                                                 */
    filter_mask = ih264_luma_vert_bs4_absdiff_lt(p0_16x8, q0_16x8,
                                                 alpha_8x16);
    filter_mask = _mm_and_si128(filter_mask,
                    ih264_luma_vert_bs4_absdiff_lt(q1_16x8, q0_16x8,
                                                   beta_8x16));
    filter_mask = _mm_and_si128(filter_mask,
                    ih264_luma_vert_bs4_absdiff_lt(p1_16x8, p0_16x8,
                                                   beta_8x16));

    /* ... && (ABS(p0 - q0) < ((alpha >> 2) + 2)) */
    strong_mask = _mm_and_si128(filter_mask,
                    ih264_luma_vert_bs4_absdiff_lt(p0_16x8, q0_16x8,
                                                   alpha_strong_8x16));

    /* ... && (ABS(p2 - p0) < beta) */
    p_strong_mask = _mm_and_si128(strong_mask,
                    ih264_luma_vert_bs4_absdiff_lt(p2_16x8, p0_16x8,
                                                   beta_8x16));

    /* ... && (ABS(q2 - q0) < beta) */
    q_strong_mask = _mm_and_si128(strong_mask,
                    ih264_luma_vert_bs4_absdiff_lt(q2_16x8, q0_16x8,
                                                   beta_8x16));

    /* Candidate values for both sides, computed from the unfiltered input */
    ih264_luma_vert_bs4_side_16px(p3_16x8, p2_16x8, p1_16x8, p0_16x8,
                                  q0_16x8, q1_16x8,
                                  &p0_weak, &p0_strong,
                                  &p1_strong, &p2_strong);
    ih264_luma_vert_bs4_side_16px(q3_16x8, q2_16x8, q1_16x8, q0_16x8,
                                  p0_16x8, p1_16x8,
                                  &q0_weak, &q0_strong,
                                  &q1_strong, &q2_strong);

    /* p0/q0: the weak value where the row is filtered, replaced by the     */
    /* strong value where that side's strong mask is set.                   */
    p0_out = ih264_luma_vert_bs4_select(filter_mask, p0_weak, p0_16x8);
    p0_out = ih264_luma_vert_bs4_select(p_strong_mask, p0_strong, p0_out);
    q0_out = ih264_luma_vert_bs4_select(filter_mask, q0_weak, q0_16x8);
    q0_out = ih264_luma_vert_bs4_select(q_strong_mask, q0_strong, q0_out);

    /* p1/q1 and p2/q2 change only under the strong filter */
    p1_out = ih264_luma_vert_bs4_select(p_strong_mask, p1_strong, p1_16x8);
    q1_out = ih264_luma_vert_bs4_select(q_strong_mask, q1_strong, q1_16x8);
    p2_out = ih264_luma_vert_bs4_select(p_strong_mask, p2_strong, p2_16x8);
    q2_out = ih264_luma_vert_bs4_select(q_strong_mask, q2_strong, q2_16x8);

    /* Transpose back and store rows 0..7, then rows 8..15 */
    ih264_luma_vert_bs4_store_8rows(pu1_rows_0_7, src_strd,
                                    _mm_unpacklo_epi8(p3_16x8, p2_out),
                                    _mm_unpacklo_epi8(p1_out, p0_out),
                                    _mm_unpacklo_epi8(q0_out, q1_out),
                                    _mm_unpacklo_epi8(q2_out, q3_16x8));
    ih264_luma_vert_bs4_store_8rows(pu1_rows_8_15, src_strd,
                                    _mm_unpackhi_epi8(p3_16x8, p2_out),
                                    _mm_unpackhi_epi8(p1_out, p0_out),
                                    _mm_unpackhi_epi8(q0_out, q1_out),
                                    _mm_unpackhi_epi8(q2_out, q3_16x8));
}
480
481/*****************************************************************************/
482/*                                                                           */
483/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
484/*                                                                           */
485/*  Description   : This function performs filtering of a luma block         */
486/*                  horizontal edge when the boundary strength is set to 4.  */
487/*                                                                           */
488/*  Inputs        : pu1_src    - pointer to the src sample q0                */
489/*                  src_strd   - source stride                               */
490/*                  alpha      - alpha value for the boundary                */
491/*                  beta       - beta value for the boundary                 */
492/*                                                                           */
493/*  Globals       : None                                                     */
494/*                                                                           */
495/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
496/*                  title "Filtering process for edges for bS equal to 4" in */
497/*                  ITU T Rec H.264.                                         */
498/*                                                                           */
499/*  Outputs       : None                                                     */
500/*                                                                           */
501/*  Returns       : None                                                     */
502/*                                                                           */
503/*  Issues        : None                                                     */
504/*                                                                           */
505/*  Revision History:                                                        */
506/*                                                                           */
507/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
508/*         12 02 2015   Naveen Kumar P  Initial version                      */
509/*                                                                           */
510/*****************************************************************************/
511void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
512                                     WORD32 src_strd,
513                                     WORD32 alpha,
514                                     WORD32 beta)
515{
516    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
517    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
518    UWORD8 *pu1_HorzPixel;
519    __m128i zero = _mm_setzero_si128();
520    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
521    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
522    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
523    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
524    __m128i q0_16x8_1;
525    __m128i p0_16x8_1;
526    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
527    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
528    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
529    __m128i Alpha_8x16, Beta_8x16;
530    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
531    __m128i const_val2_16x8 = _mm_set1_epi16(2);
532
533    pu1_HorzPixel = pu1_src - (src_strd << 2);
534
535    i16_posQ1 = src_strd;
536    i16_posQ2 = X2(src_strd);
537    i16_posQ3 = X3(src_strd);
538    i16_posP0 = X3(src_strd);
539    i16_posP1 = X2(src_strd);
540    i16_posP2 = src_strd;
541    i16_posP3 = 0;
542
543    Alpha_8x16 = _mm_set1_epi16(alpha);
544    Beta_8x16 = _mm_set1_epi16(beta);
545
546    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
547    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
548    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
549    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
550    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
551    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
552    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
553    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
554
555    //Cond1 (ABS(p0 - q0) < alpha)
556    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
557    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
558    temp1 = _mm_add_epi8(temp1, temp2);
559
560    temp2 = _mm_unpacklo_epi8(temp1, zero);
561    temp1 = _mm_unpackhi_epi8(temp1, zero);
562
563    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
564    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
565
566    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
567
568    //Cond2 (ABS(q1 - q0) < beta)
569    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
570    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
571    temp1 = _mm_add_epi8(temp1, temp2);
572
573    temp2 = _mm_unpacklo_epi8(temp1, zero);
574    temp1 = _mm_unpackhi_epi8(temp1, zero);
575
576    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
577    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
578
579    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
580
581    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
582
583    //Cond3 (ABS(p1 - p0) < beta)
584    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
585    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
586    temp1 = _mm_add_epi8(temp1, temp2);
587
588    temp2 = _mm_unpacklo_epi8(temp1, zero);
589    temp1 = _mm_unpackhi_epi8(temp1, zero);
590
591    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
592    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
593
594    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
595
596    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
597    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
598
599    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
600    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
601    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
602    temp1 = _mm_add_epi8(temp1, temp2);
603    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
604    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
605
606    temp2 = _mm_unpacklo_epi8(temp1, zero);
607    temp1 = _mm_unpackhi_epi8(temp1, zero);
608    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
609    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
610
611    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
612    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
613
614    // (ABS(p2 - p0) < beta)
615    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
616    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
617    temp1 = _mm_add_epi8(temp1, temp2);
618
619    temp2 = _mm_unpacklo_epi8(temp1, zero);
620    temp1 = _mm_unpackhi_epi8(temp1, zero);
621    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
622    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
623
624    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
625    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
626
627    // (ABS(q2 - q0) < beta)
628    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
629    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
630    temp1 = _mm_add_epi8(temp1, temp2);
631
632    temp2 = _mm_unpacklo_epi8(temp1, zero);
633    temp1 = _mm_unpackhi_epi8(temp1, zero);
634    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
635    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
636
637    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
638    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
639
640    // First 8 pixels
641    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
642    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
643    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
644    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
645    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
646    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
647    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
648    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
649
650    // p0_1 and q0_1
651    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
652    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
653    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
654    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
655    temp3 = _mm_slli_epi16(p1_8x16, 1);
656    temp4 = _mm_slli_epi16(q1_8x16, 1);
657    temp1 = _mm_add_epi16(temp5, temp3);
658    temp2 = _mm_add_epi16(temp6, temp4);
659    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
660    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
661
662    // p1_2 and q1_2
663    temp6 = _mm_add_epi16(temp6, p0_8x16);
664    temp5 = _mm_add_epi16(temp5, q0_8x16);
665    temp1 = _mm_add_epi16(temp6, p2_8x16);
666    temp2 = _mm_add_epi16(temp5, q2_8x16);
667    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
668    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
669
670    // p0_2 and q0_2
671    temp1 = _mm_add_epi16(temp3, p2_8x16);
672    temp2 = _mm_add_epi16(temp4, q2_8x16);
673    temp1 = _mm_add_epi16(temp1, q1_8x16);
674    temp2 = _mm_add_epi16(temp2, p1_8x16);
675    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
676    temp3 = _mm_slli_epi16(temp3, 1);
677    temp1 = _mm_add_epi16(temp1, temp3);
678    temp2 = _mm_add_epi16(temp2, temp3);
679    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
680    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
681    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
682    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
683
684    // p2_2 and q2_2
685    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
686    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
687    temp3 = _mm_slli_epi16(p2_8x16, 1);
688    temp4 = _mm_slli_epi16(q2_8x16, 1);
689    temp3 = _mm_add_epi16(p2_8x16, temp3);
690    temp4 = _mm_add_epi16(q2_8x16, temp4);
691    temp5 = _mm_slli_epi16(p3_8x16, 1);
692    temp6 = _mm_slli_epi16(q3_8x16, 1);
693    temp1 = _mm_add_epi16(temp1, temp3);
694    temp2 = _mm_add_epi16(temp2, temp4);
695    temp1 = _mm_add_epi16(temp1, temp5);
696    temp2 = _mm_add_epi16(temp2, temp6);
697    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
698    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
699
700    // Second 8 pixels and packing with first 8 pixels
701    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
702    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
703    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
704    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
705    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
706    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
707    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
708    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
709
710    // p0_1 and q0_1
711    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
712    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
713    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
714    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
715    temp3 = _mm_slli_epi16(p1_8x16, 1);
716    temp4 = _mm_slli_epi16(q1_8x16, 1);
717    temp1 = _mm_add_epi16(temp5, temp3);
718    temp2 = _mm_add_epi16(temp6, temp4);
719    temp1 = _mm_srai_epi16(temp1, 2);
720    temp2 = _mm_srai_epi16(temp2, 2);
721    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
722    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
723
724    // p1_2 and q1_2
725    temp6 = _mm_add_epi16(temp6, p0_8x16);
726    temp5 = _mm_add_epi16(temp5, q0_8x16);
727    temp1 = _mm_add_epi16(temp6, p2_8x16);
728    temp2 = _mm_add_epi16(temp5, q2_8x16);
729    temp1 = _mm_srai_epi16(temp1, 2);
730    temp2 = _mm_srai_epi16(temp2, 2);
731    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
732    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
733
734    // p0_2 and q0_2
735    temp1 = _mm_add_epi16(temp3, p2_8x16);
736    temp2 = _mm_add_epi16(temp4, q2_8x16);
737    temp1 = _mm_add_epi16(temp1, q1_8x16);
738    temp2 = _mm_add_epi16(temp2, p1_8x16);
739    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
740    temp3 = _mm_slli_epi16(temp3, 1);
741    temp1 = _mm_add_epi16(temp1, temp3);
742    temp2 = _mm_add_epi16(temp2, temp3);
743    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
744    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
745    temp1 = _mm_srai_epi16(temp1, 3);
746    temp2 = _mm_srai_epi16(temp2, 3);
747    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
748    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
749
750    // p2_2 and q2_2
751    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
752    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
753    temp3 = _mm_slli_epi16(p2_8x16, 1);
754    temp4 = _mm_slli_epi16(q2_8x16, 1);
755    temp3 = _mm_add_epi16(p2_8x16, temp3);
756    temp4 = _mm_add_epi16(q2_8x16, temp4);
757    temp5 = _mm_slli_epi16(p3_8x16, 1);
758    temp6 = _mm_slli_epi16(q3_8x16, 1);
759    temp1 = _mm_add_epi16(temp1, temp3);
760    temp2 = _mm_add_epi16(temp2, temp4);
761    temp1 = _mm_add_epi16(temp1, temp5);
762    temp2 = _mm_add_epi16(temp2, temp6);
763    temp1 = _mm_srai_epi16(temp1, 3);
764    temp2 = _mm_srai_epi16(temp2, 3);
765    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
766    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
767
768    // p0 and q0
769    p0_16x8 = _mm_and_si128(p0_16x8,
770                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
771    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
772    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
773    q0_16x8 = _mm_and_si128(q0_16x8,
774                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
775    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
776    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
777
778    // p0 and q0
779    p0_16x8 = _mm_and_si128(p0_16x8,
780                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
781    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
782    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
783    q0_16x8 = _mm_and_si128(q0_16x8,
784                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
785    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
786    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
787
788    // p1 and q1
789    p1_16x8 = _mm_and_si128(p1_16x8,
790                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
791    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
792    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
793    q1_16x8 = _mm_and_si128(q1_16x8,
794                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
795    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
796    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
797
798    // p2 and q2
799    p2_16x8 = _mm_and_si128(p2_16x8,
800                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
801    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
802    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
803    q2_16x8 = _mm_and_si128(q2_16x8,
804                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
805    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
806    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
807
808    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
809    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
810    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
811
812    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
813    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
814    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
815
816}
817
818/*****************************************************************************/
819/*                                                                           */
820/*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
821/*                                                                           */
822/*  Description   : This function performs filtering of a luma block         */
823/*                  vertical edge when the boundary strength is less than 4. */
824/*                                                                           */
825/*  Inputs        : pu1_src       - pointer to the src sample q0             */
826/*                  src_strd      - source stride                            */
827/*                  alpha         - alpha value for the boundary             */
828/*                  beta          - beta value for the boundary              */
829/*                  u4_bs         - packed Boundary strength array           */
830/*                  pu1_cliptab   - tc0_table                                */
831/*                                                                           */
832/*  Globals       : None                                                     */
833/*                                                                           */
834/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
835/*                  title "Filtering process for edges for bS less than 4"   */
836/*                  in ITU T Rec H.264.                                      */
837/*                                                                           */
838/*  Outputs       : None                                                     */
839/*                                                                           */
840/*  Returns       : None                                                     */
841/*                                                                           */
842/*  Issues        : None                                                     */
843/*                                                                           */
844/*  Revision History:                                                        */
845/*                                                                           */
846/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
847/*         12 02 2015   Naveen Kumar P  Initial version                      */
848/*                                                                           */
849/*****************************************************************************/
850void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
851                                       WORD32 src_strd,
852                                       WORD32 alpha,
853                                       WORD32 beta,
854                                       UWORD32 u4_bs,
855                                       const UWORD8 *pu1_cliptab)
856{
857    UWORD8 u1_Bs, u1_Bs1;
858
859    WORD32 j = 0;
860
861    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
862    __m128i int1, int2, int3, int4, high1, high2;
863    __m128i flag, flag1, i_C, i_C0;
864    __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
865                    temp1;
866    __m128i zero = _mm_setzero_si128();
867
868    for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
869    {
870        //Transpose
871        linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
872        lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
873        linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
874        lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));
875
876        linea = _mm_unpacklo_epi8(linea, zero);
877        lineb = _mm_unpacklo_epi8(lineb, zero);
878        linec = _mm_unpacklo_epi8(linec, zero);
879        lined = _mm_unpacklo_epi8(lined, zero);
880
881        int1 = _mm_unpacklo_epi16(linea, lineb);
882        lineb = _mm_unpackhi_epi16(linea, lineb);
883
884        int2 = _mm_unpacklo_epi16(linec, lined);
885        lined = _mm_unpackhi_epi16(linec, lined);
886
887        linea = _mm_unpacklo_epi16(int1, int2);
888        int1 = _mm_unpackhi_epi16(int1, int2);
889
890        linec = _mm_unpacklo_epi16(lineb, lined);
891        high1 = _mm_unpackhi_epi16(lineb, lined);
892
893        linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
894        linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
895        lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
896        lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));
897
898        linee = _mm_unpacklo_epi8(linee, zero);
899        linef = _mm_unpacklo_epi8(linef, zero);
900        lineg = _mm_unpacklo_epi8(lineg, zero);
901        lineh = _mm_unpacklo_epi8(lineh, zero);
902
903        int2 = _mm_unpacklo_epi16(linee, linef);
904        linef = _mm_unpackhi_epi16(linee, linef);
905
906        int3 = _mm_unpacklo_epi16(lineg, lineh);
907        lineh = _mm_unpackhi_epi16(lineg, lineh);
908
909        linee = _mm_unpacklo_epi16(int2, int3);
910        int2 = _mm_unpackhi_epi16(int2, int3);
911
912        lineg = _mm_unpacklo_epi16(linef, lineh);
913        high2 = _mm_unpackhi_epi16(linef, lineh);
914
915        int4 = _mm_unpacklo_epi16(linea, linee);
916        lineb = _mm_unpackhi_epi16(linea, linee);
917
918        int3 = _mm_unpacklo_epi16(int1, int2);
919        lined = _mm_unpackhi_epi16(int1, int2);
920
921        int2 = _mm_unpacklo_epi16(linec, lineg);
922        linef = _mm_unpackhi_epi16(linec, lineg);
923
924        linea = int4;
925        linec = int3;
926        linee = int2;
927
928        lineg = _mm_unpacklo_epi16(high1, high2);
929        lineh = _mm_unpackhi_epi16(high1, high2);
930
931        //end of transpose
932
933        u1_Bs = (u4_bs >> 24) & 0xff;
934        u1_Bs1 = (u4_bs >> 16) & 0xff;
935        u4_bs <<= 16;
936
937        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
938                              u1_Bs1, u1_Bs);
939        flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
940        flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask
941
942        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
943                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
944                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
945                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
946
947        diff = _mm_subs_epi16(linec, lined); //Condn 1
948        diff = _mm_abs_epi16(diff);
949        const1 = _mm_set1_epi16(alpha);
950        flag = _mm_cmpgt_epi16(const1, diff);
951
952        diff = _mm_subs_epi16(linee, lined); //Condtn 2
953        diff = _mm_abs_epi16(diff);
954        const1 = _mm_set1_epi16(beta);
955        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));
956
957        diff = _mm_subs_epi16(lineb, linec); //Condtn 3
958        diff = _mm_abs_epi16(diff);
959        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on
960
961        flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)
962
963        //Adding Ap<Beta and Aq<Beta
964        i_Ap = _mm_subs_epi16(linea, linec);
965        i_Ap = _mm_abs_epi16(i_Ap);
966        const2 = _mm_cmpgt_epi16(const1, i_Ap);
967        const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
968        i_C = _mm_add_epi16(i_C0, const2);
969
970        i_Aq = _mm_subs_epi16(linef, lined);
971        i_Aq = _mm_abs_epi16(i_Aq);
972        const2 = _mm_cmpgt_epi16(const1, i_Aq);
973        const2 = _mm_subs_epi16(zero, const2);
974        i_C = _mm_add_epi16(i_C, const2);
975
976        //Calculate in_macro
977        diff = _mm_subs_epi16(lined, linec);
978        diff = _mm_slli_epi16(diff, 2);
979        const2 = _mm_subs_epi16(lineb, linee);
980        diff = _mm_add_epi16(diff, const2);
981        const2 = _mm_set1_epi16(4);
982        diff = _mm_add_epi16(diff, const2);
983        in_macro = _mm_srai_epi16(diff, 3);
984
985        in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
986        i_C = _mm_subs_epi16(zero, i_C);
987        in_macro = _mm_max_epi16(i_C, in_macro);
988
989        //Compute and store
990        in_macrotemp = _mm_add_epi16(linec, in_macro);
991        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
992        temp = _mm_and_si128(linec,
993                             _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
994        temp = _mm_add_epi16(temp, in_macrotemp);
995        //temp= _mm_packus_epi16 (temp, zero);
996        //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);
997
998        in_macrotemp = _mm_subs_epi16(lined, in_macro);
999        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
1000        temp1 = _mm_and_si128(lined,
1001                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
1002        temp1 = _mm_add_epi16(temp1, in_macrotemp);
1003        //temp1= _mm_packus_epi16 (temp1, zero);
1004        //_mm_storel_epi64(pu1_src+i, in_macrotemp);
1005
1006        //If Ap<Beta
1007        flag1 = _mm_cmpgt_epi16(const1, i_Ap);
1008        flag1 = _mm_and_si128(flag, flag1);
1009        in_macrotemp = _mm_add_epi16(linec, lined);
1010        in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
1011        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);
1012        in_macro = _mm_add_epi16(in_macrotemp, linea);
1013        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
1014        in_macro = _mm_srai_epi16(in_macro, 1);
1015
1016        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
1017        i_C0 = _mm_subs_epi16(zero, i_C0);
1018        in_macro = _mm_max_epi16(i_C0, in_macro);
1019
1020        in_macro = _mm_and_si128(in_macro, flag1);
1021        lineb = _mm_add_epi16(lineb, in_macro);
1022        //in_macro= _mm_packus_epi16 (i_p1, zero);
1023        //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);
1024
1025        flag1 = _mm_cmpgt_epi16(const1, i_Aq);
1026        flag1 = _mm_and_si128(flag, flag1);
1027        in_macro = _mm_add_epi16(in_macrotemp, linef);
1028        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
1029        in_macro = _mm_srai_epi16(in_macro, 1);
1030
1031        i_C0 = _mm_abs_epi16(i_C0);
1032        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
1033        i_C0 = _mm_subs_epi16(zero, i_C0);
1034        in_macro = _mm_max_epi16(i_C0, in_macro);
1035
1036        in_macro = _mm_and_si128(in_macro, flag1);
1037        linee = _mm_add_epi16(linee, in_macro);
1038        //in_macro= _mm_packus_epi16 (i_q1, zero);
1039        //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
1040        linec = temp;
1041        lined = temp1;
1042        //End of filtering
1043
1044        int1 = _mm_unpacklo_epi16(linea, linee);
1045        linee = _mm_unpackhi_epi16(linea, linee);
1046
1047        int2 = _mm_unpacklo_epi16(linec, lineg);
1048        lineg = _mm_unpackhi_epi16(linec, lineg);
1049
1050        linea = _mm_unpacklo_epi16(int1, int2);
1051        int3 = _mm_unpackhi_epi16(int1, int2);
1052
1053        linec = _mm_unpacklo_epi16(linee, lineg);
1054        lineg = _mm_unpackhi_epi16(linee, lineg);
1055
1056        int1 = _mm_unpacklo_epi16(lineb, linef);
1057        linef = _mm_unpackhi_epi16(lineb, linef);
1058
1059        int2 = _mm_unpacklo_epi16(lined, lineh);
1060        lineh = _mm_unpackhi_epi16(lined, lineh);
1061
1062        lineb = _mm_unpacklo_epi16(int1, int2);
1063        int4 = _mm_unpackhi_epi16(int1, int2);
1064
1065        lined = _mm_unpacklo_epi16(linef, lineh);
1066        lineh = _mm_unpackhi_epi16(linef, lineh);
1067
1068        int1 = _mm_unpackhi_epi16(linea, lineb);
1069        linea = _mm_unpacklo_epi16(linea, lineb);
1070
1071        int2 = _mm_unpacklo_epi16(int3, int4);
1072        high1 = _mm_unpackhi_epi16(int3, int4);
1073
1074        lineb = _mm_unpacklo_epi16(linec, lined);
1075        linef = _mm_unpackhi_epi16(linec, lined);
1076
1077        lined = _mm_unpacklo_epi16(lineg, lineh);
1078        lineh = _mm_unpackhi_epi16(lineg, lineh);
1079
1080        linee = int1;
1081        lineg = high1;
1082        linec = int2;
1083        //End of inverse transpose
1084
1085        //Packs and stores
1086        linea = _mm_packus_epi16(linea, zero);
1087        _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);
1088
1089        lineb = _mm_packus_epi16(lineb, zero);
1090        _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);
1091
1092        linec = _mm_packus_epi16(linec, zero);
1093        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);
1094
1095        lined = _mm_packus_epi16(lined, zero);
1096        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);
1097
1098        linee = _mm_packus_epi16(linee, zero);
1099        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);
1100
1101        linef = _mm_packus_epi16(linef, zero);
1102        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);
1103
1104        lineg = _mm_packus_epi16(lineg, zero);
1105        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);
1106
1107        lineh = _mm_packus_epi16(lineh, zero);
1108        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);
1109
1110    }
1111}
1112
1113/*****************************************************************************/
1114/*                                                                           */
1115/*  Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                      */
1116/*                                                                           */
1117/*  Description   : This function performs filtering of a luma block         */
1118/*                  horizontal edge when boundary strength is less than 4.   */
1119/*                                                                           */
1120/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1121/*                  src_strd      - source stride                            */
1122/*                  alpha         - alpha value for the boundary             */
1123/*                  beta          - beta value for the boundary              */
1124/*                  u4_bs         - packed Boundary strength array           */
1125/*                  pu1_cliptab   - tc0_table                                */
1126/*                                                                           */
1127/*  Globals       : None                                                     */
1128/*                                                                           */
1129/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
1130/*                  title "Filtering process for edges for bS less than 4"   */
1131/*                  in ITU T Rec H.264.                                      */
1132/*                                                                           */
1133/*  Outputs       : None                                                     */
1134/*                                                                           */
1135/*  Returns       : None                                                     */
1136/*                                                                           */
1137/*  Issues        : None                                                     */
1138/*                                                                           */
1139/*  Revision History:                                                        */
1140/*                                                                           */
1141/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1142/*         12 02 2015   Naveen Kumar P  Initial version                      */
1143/*                                                                           */
1144/*****************************************************************************/
1145void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
1146                                       WORD32 src_strd,
1147                                       WORD32 alpha,
1148                                       WORD32 beta,
1149                                       UWORD32 u4_bs,
1150                                       const UWORD8 *pu1_cliptab)
1151{
1152    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
1153    UWORD8 *pu1_HorzPixel;
1154    __m128i zero = _mm_setzero_si128();
1155    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
1156    __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
1157    __m128i temp1, temp2;
1158    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
1159    __m128i in_macro_16x8, in_macro_hi_16x8;
1160    __m128i const_val4_8x16;
1161    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
1162    UWORD8 clip0, clip1, clip2, clip3;
1163
1164    pu1_HorzPixel = pu1_src - (src_strd << 2);
1165
1166    i16_posQ1 = src_strd;
1167    i16_posQ2 = X2(src_strd);
1168    i16_posP0 = X3(src_strd);
1169    i16_posP1 = X2(src_strd);
1170    i16_posP2 = src_strd;
1171
1172    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
1173    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
1174
1175    u1_Bs0 = (u4_bs >> 24) & 0xff;
1176    u1_Bs1 = (u4_bs >> 16) & 0xff;
1177    u1_Bs2 = (u4_bs >> 8) & 0xff;
1178    u1_Bs3 = (u4_bs >> 0) & 0xff;
1179    clip0 = pu1_cliptab[u1_Bs0];
1180    clip1 = pu1_cliptab[u1_Bs1];
1181    clip2 = pu1_cliptab[u1_Bs2];
1182    clip3 = pu1_cliptab[u1_Bs3];
1183
1184    Alpha_8x16 = _mm_set1_epi16(alpha);
1185    Beta_8x16 = _mm_set1_epi16(beta);
1186
1187    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
1188                                 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
1189                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
1190
1191    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
1192                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
1193                           clip0, clip0);
1194
1195    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
1196    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
1197    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
1198    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
1199
1200    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
1201    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
1202    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
1203    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
1204
1205    //Cond1 (ABS(p0 - q0) < alpha)
1206    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1207    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1208    temp1 = _mm_add_epi8(temp1, temp2);
1209
1210    temp2 = _mm_unpacklo_epi8(temp1, zero);
1211    temp1 = _mm_unpackhi_epi8(temp1, zero);
1212
1213    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1214    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1215
1216    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1217    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
1218
1219    //Cond2 (ABS(q1 - q0) < beta)
1220    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1221    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1222    temp1 = _mm_add_epi8(temp1, temp2);
1223
1224    temp2 = _mm_unpacklo_epi8(temp1, zero);
1225    temp1 = _mm_unpackhi_epi8(temp1, zero);
1226
1227    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1228    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1229
1230    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1231
1232    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1233
1234    //Cond3 (ABS(p1 - p0) < beta)
1235    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1236    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1237    temp1 = _mm_add_epi8(temp1, temp2);
1238
1239    temp2 = _mm_unpacklo_epi8(temp1, zero);
1240    temp1 = _mm_unpackhi_epi8(temp1, zero);
1241
1242    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1243    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1244
1245    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1246
1247    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1248    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1249
1250    // (ABS(p2 - p0) < beta)
1251    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1252    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1253    temp1 = _mm_add_epi8(temp1, temp2);
1254
1255    temp2 = _mm_unpacklo_epi8(temp1, zero);
1256    temp1 = _mm_unpackhi_epi8(temp1, zero);
1257    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1258    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1259
1260    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1261    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1262
1263    temp2 = _mm_subs_epi16(zero, temp2);
1264    temp1 = _mm_subs_epi16(zero, temp1);
1265
1266    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
1267    C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
1268
1269    // (ABS(q2 - q0) < beta)
1270    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1271    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1272    temp1 = _mm_add_epi8(temp1, temp2);
1273
1274    temp2 = _mm_unpacklo_epi8(temp1, zero);
1275    temp1 = _mm_unpackhi_epi8(temp1, zero);
1276    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1277    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1278
1279    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1280    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
1281
1282    temp2 = _mm_subs_epi16(zero, temp2);
1283    temp1 = _mm_subs_epi16(zero, temp1);
1284
1285    C_8x16 = _mm_add_epi16(C_8x16, temp2);
1286    C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
1287
1288    const_val4_8x16 = _mm_set1_epi16(4);
1289    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
1290                           _mm_unpacklo_epi8(p0_16x8, zero));
1291    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
1292                           _mm_unpacklo_epi8(q1_16x8, zero));
1293    temp1 = _mm_slli_epi16(temp1, 2);
1294    temp1 = _mm_add_epi16(temp1, temp2);
1295    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1296    in_macro_16x8 = _mm_srai_epi16(temp1, 3);
1297
1298    temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
1299                           _mm_unpackhi_epi8(p0_16x8, zero));
1300    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
1301                           _mm_unpackhi_epi8(q1_16x8, zero));
1302    temp1 = _mm_slli_epi16(temp1, 2);
1303    temp1 = _mm_add_epi16(temp1, temp2);
1304    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1305    in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
1306
1307    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
1308    in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
1309    C_8x16 = _mm_subs_epi16(zero, C_8x16);
1310    C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
1311    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
1312    in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
1313
1314    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
1315    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
1316
1317    temp1 = _mm_packus_epi16(temp1, temp2);
1318
1319    temp1 = _mm_and_si128(temp1, flag1_16x8);
1320    temp2 = _mm_and_si128(p0_16x8,
1321                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1322
1323    temp1 = _mm_add_epi8(temp1, temp2);
1324
1325    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
1326
1327    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
1328    temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
1329
1330    temp1 = _mm_packus_epi16(temp1, temp2);
1331
1332    temp1 = _mm_and_si128(temp1, flag1_16x8);
1333    temp2 = _mm_and_si128(q0_16x8,
1334                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1335
1336    temp1 = _mm_add_epi8(temp1, temp2);
1337    _mm_storeu_si128((__m128i *)(pu1_src), temp1);
1338
1339    //if(Ap < Beta)
1340    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1341                          _mm_unpacklo_epi8(p0_16x8, zero));
1342    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
1343    //temp2 = _mm_subs_epi16(zero,temp2);
1344    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
1345    temp2 = _mm_add_epi16(temp1, temp2);
1346    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1347
1348    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
1349                          _mm_unpackhi_epi8(p0_16x8, zero));
1350    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
1351    //temp2 = _mm_subs_epi16(zero,temp2);
1352    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
1353    temp2 = _mm_add_epi16(temp1, temp2);
1354    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
1355
1356    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1357    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1358    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1359    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
1360    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1361    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1362
1363    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
1364    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
1365
1366    temp1 = _mm_packus_epi16(temp1, temp2);
1367
1368    temp1 = _mm_and_si128(temp1, flag2_16x8);
1369    temp2 = _mm_and_si128(p1_16x8,
1370                          _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
1371    temp1 = _mm_add_epi8(temp1, temp2);
1372    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
1373
1374    //if(Aq < Beta)
1375    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1376                          _mm_unpacklo_epi8(p0_16x8, zero));
1377    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
1378    //temp2 = _mm_slli_epi16 (temp2, 1);
1379    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
1380    temp2 = _mm_add_epi16(temp1, temp2);
1381    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1382
1383    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
1384                          _mm_unpackhi_epi8(p0_16x8, zero));
1385    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
1386    //temp2 = _mm_slli_epi16 (temp2, 1);
1387    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
1388    temp2 = _mm_add_epi16(temp1, temp2);
1389    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
1390
1391    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1392    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1393    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1394    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
1395    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1396    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1397
1398    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
1399    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
1400
1401    temp1 = _mm_packus_epi16(temp1, temp2);
1402
1403    temp1 = _mm_and_si128(temp1, flag3_16x8);
1404    temp2 = _mm_and_si128(q1_16x8,
1405                          _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
1406    temp1 = _mm_add_epi8(temp1, temp2);
1407
1408    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
1409
1410}
1411
1412/*****************************************************************************/
1413/*                                                                           */
1414/*  Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                  */
1415/*                                                                           */
1416/*  Description   : This function performs filtering of a luma block         */
1417/*                  vertical edge when boundary strength is set to 4.        */
1418/*                                                                           */
1419/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1420/*                  src_strd      - source stride                            */
1421/*                  alpha         - alpha value for the boundary             */
1422/*                  beta          - beta value for the boundary              */
1423/*                                                                           */
1424/*  Globals       : None                                                     */
1425/*                                                                           */
1426/*  Processing    : When the function is called twice, this operation is as  */
1427/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1428/*                  process for edges for bS equal to 4" in ITU T Rec H.264. */
1429/*                                                                           */
1430/*  Outputs       : None                                                     */
1431/*                                                                           */
1432/*  Returns       : None                                                     */
1433/*                                                                           */
1434/*  Issues        : None                                                     */
1435/*                                                                           */
1436/*  Revision History:                                                        */
1437/*                                                                           */
1438/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1439/*         12 02 2015   Naveen Kumar P  Initial version                      */
1440/*                                                                           */
1441/*****************************************************************************/
1442void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
1443                                           WORD32 src_strd,
1444                                           WORD32 alpha,
1445                                           WORD32 beta)
1446{
1447    __m128i zero = _mm_setzero_si128();
1448    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1449    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1450    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
1451    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
1452    __m128i q0_16x8_1;
1453    __m128i p0_16x8_1;
1454    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
1455    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
1456    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
1457    __m128i Alpha_8x16, Beta_8x16;
1458    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
1459    __m128i const_val2_16x8 = _mm_set1_epi16(2);
1460    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1461
1462    Alpha_8x16 = _mm_set1_epi16(alpha);
1463    Beta_8x16 = _mm_set1_epi16(beta);
1464
1465    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1466    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1467    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1468    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1469    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1470    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1471    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1472    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1473
1474    temp1 = _mm_unpacklo_epi8(line1, line2);
1475    temp2 = _mm_unpacklo_epi8(line3, line4);
1476    temp3 = _mm_unpacklo_epi8(line5, line6);
1477    temp4 = _mm_unpacklo_epi8(line7, line8);
1478
1479    line1 = _mm_unpacklo_epi16(temp1, temp2);
1480    line2 = _mm_unpackhi_epi16(temp1, temp2);
1481    line3 = _mm_unpacklo_epi16(temp3, temp4);
1482    line4 = _mm_unpackhi_epi16(temp3, temp4);
1483
1484    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
1485    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
1486    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
1487    q1_8x16 = _mm_unpackhi_epi32(line2, line4);
1488
1489    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
1490    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
1491    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
1492    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
1493    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
1494    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
1495    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
1496    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
1497
1498    //Cond1 (ABS(p0 - q0) < alpha)
1499    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1500    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1501    temp1 = _mm_add_epi8(temp1, temp2);
1502
1503    temp2 = _mm_unpacklo_epi8(temp1, zero);
1504    temp1 = _mm_unpackhi_epi8(temp1, zero);
1505
1506    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1507    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1508
1509    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1510
1511    //Cond2 (ABS(q1 - q0) < beta)
1512    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1513    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1514    temp1 = _mm_add_epi8(temp1, temp2);
1515
1516    temp2 = _mm_unpacklo_epi8(temp1, zero);
1517    temp1 = _mm_unpackhi_epi8(temp1, zero);
1518
1519    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1520    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1521
1522    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1523
1524    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1525
1526    //Cond3 (ABS(p1 - p0) < beta)
1527    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1528    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1529    temp1 = _mm_add_epi8(temp1, temp2);
1530
1531    temp2 = _mm_unpacklo_epi8(temp1, zero);
1532    temp1 = _mm_unpackhi_epi8(temp1, zero);
1533
1534    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1535    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1536
1537    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1538
1539    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1540    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1541
1542    // (ABS(p0 - q0) < ((alpha >> 2) + 2))
1543    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
1544    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
1545    temp1 = _mm_add_epi8(temp1, temp2);
1546    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
1547    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
1548
1549    temp2 = _mm_unpacklo_epi8(temp1, zero);
1550    temp1 = _mm_unpackhi_epi8(temp1, zero);
1551    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1552    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1553
1554    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1555    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1556
1557    // (ABS(p2 - p0) < beta)
1558    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1559    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1560    temp1 = _mm_add_epi8(temp1, temp2);
1561
1562    temp2 = _mm_unpacklo_epi8(temp1, zero);
1563    temp1 = _mm_unpackhi_epi8(temp1, zero);
1564    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1565    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1566
1567    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1568    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
1569
1570    // (ABS(q2 - q0) < beta)
1571    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1572    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1573    temp1 = _mm_add_epi8(temp1, temp2);
1574
1575    temp2 = _mm_unpacklo_epi8(temp1, zero);
1576    temp1 = _mm_unpackhi_epi8(temp1, zero);
1577    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1578    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1579
1580    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
1581    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
1582
1583    // First 8 pixels
1584    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
1585    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
1586    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
1587    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
1588    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
1589    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
1590    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
1591    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
1592
1593    // p0_1 and q0_1
1594    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
1595    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
1596    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
1597    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
1598    temp3 = _mm_slli_epi16(p1_8x16, 1);
1599    temp4 = _mm_slli_epi16(q1_8x16, 1);
1600    temp1 = _mm_add_epi16(temp5, temp3);
1601    temp2 = _mm_add_epi16(temp6, temp4);
1602    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
1603    q0_16x8_1 = _mm_srai_epi16(temp2, 2);
1604
1605    // p1_2 and q1_2
1606    temp6 = _mm_add_epi16(temp6, p0_8x16);
1607    temp5 = _mm_add_epi16(temp5, q0_8x16);
1608    temp1 = _mm_add_epi16(temp6, p2_8x16);
1609    temp2 = _mm_add_epi16(temp5, q2_8x16);
1610    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
1611    q1_16x8_2 = _mm_srai_epi16(temp2, 2);
1612
1613    // p0_2 and q0_2
1614    temp1 = _mm_add_epi16(temp3, p2_8x16);
1615    temp2 = _mm_add_epi16(temp4, q2_8x16);
1616    temp1 = _mm_add_epi16(temp1, q1_8x16);
1617    temp2 = _mm_add_epi16(temp2, p1_8x16);
1618    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
1619    temp3 = _mm_slli_epi16(temp3, 1);
1620    temp1 = _mm_add_epi16(temp1, temp3);
1621    temp2 = _mm_add_epi16(temp2, temp3);
1622    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
1623    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
1624    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
1625    q0_16x8_2 = _mm_srai_epi16(temp2, 3);
1626
1627    // p2_2 and q2_2
1628    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
1629    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
1630    temp3 = _mm_slli_epi16(p2_8x16, 1);
1631    temp4 = _mm_slli_epi16(q2_8x16, 1);
1632    temp3 = _mm_add_epi16(p2_8x16, temp3);
1633    temp4 = _mm_add_epi16(q2_8x16, temp4);
1634    temp5 = _mm_slli_epi16(p3_8x16, 1);
1635    temp6 = _mm_slli_epi16(q3_8x16, 1);
1636    temp1 = _mm_add_epi16(temp1, temp3);
1637    temp2 = _mm_add_epi16(temp2, temp4);
1638    temp1 = _mm_add_epi16(temp1, temp5);
1639    temp2 = _mm_add_epi16(temp2, temp6);
1640    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
1641    q2_16x8_2 = _mm_srai_epi16(temp2, 3);
1642
1643    // p0_1 and q0_1
1644    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
1645    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
1646
1647    // p1_2 and q1_2
1648    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
1649    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
1650
1651    // p0_2 and q0_2
1652    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
1653    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
1654
1655    // p2_2 and q2_2
1656    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
1657    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
1658
1659    // p0 and q0
1660    p0_16x8 = _mm_and_si128(p0_16x8,
1661                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1662    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
1663    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
1664    q0_16x8 = _mm_and_si128(q0_16x8,
1665                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1666    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
1667    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
1668
1669    // p0 and q0
1670    p0_16x8 = _mm_and_si128(p0_16x8,
1671                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1672    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
1673    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
1674    q0_16x8 = _mm_and_si128(q0_16x8,
1675                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1676    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
1677    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
1678
1679    // p1 and q1
1680    p1_16x8 = _mm_and_si128(p1_16x8,
1681                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1682    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
1683    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
1684    q1_16x8 = _mm_and_si128(q1_16x8,
1685                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1686    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
1687    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
1688
1689    // p2 and q2
1690    p2_16x8 = _mm_and_si128(p2_16x8,
1691                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1692    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
1693    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
1694    q2_16x8 = _mm_and_si128(q2_16x8,
1695                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1696    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
1697    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
1698
1699    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1700    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
1701    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
1702    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1703
1704    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
1705    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
1706    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
1707    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
1708
1709    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
1710    line2 = _mm_srli_si128(line1, 8);
1711    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
1712    line4 = _mm_srli_si128(line3, 8);
1713    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
1714    line6 = _mm_srli_si128(line5, 8);
1715    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
1716    line8 = _mm_srli_si128(line7, 8);
1717
1718    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
1719    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
1720    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
1721    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
1722    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
1723    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
1724    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
1725    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
1726
1727}
1728
1729/*****************************************************************************/
1730/*                                                                           */
1731/*  Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                */
1732/*                                                                           */
1733/*  Description   : This function performs filtering of a luma block         */
1734/*                  vertical edge when boundary strength is less than 4.     */
1735/*                                                                           */
1736/*  Inputs        : pu1_src       - pointer to the src sample q0             */
1737/*                  src_strd      - source stride                            */
1738/*                  alpha         - alpha value for the boundary             */
1739/*                  beta          - beta value for the boundary              */
1740/*                  u4_bs         - packed Boundary strength array           */
1741/*                  pu1_cliptab   - tc0_table                                */
1742/*                                                                           */
1743/*  Globals       : None                                                     */
1744/*                                                                           */
1745/*  Processing    : When the function is called twice, this operation is as  */
1746/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1747/*                  process for edges for bS less than 4" in ITU T Rec H.264.*/
1748/*                                                                           */
1749/*  Outputs       : None                                                     */
1750/*                                                                           */
1751/*  Returns       : None                                                     */
1752/*                                                                           */
1753/*  Issues        : None                                                     */
1754/*                                                                           */
1755/*  Revision History:                                                        */
1756/*                                                                           */
1757/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1758/*         12 02 2015   Naveen Kumar P  Initial version                      */
1759/*                                                                           */
1760/*****************************************************************************/
1761void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
1762                                             WORD32 src_strd,
1763                                             WORD32 alpha,
1764                                             WORD32 beta,
1765                                             UWORD32 u4_bs,
1766                                             const UWORD8 *pu1_cliptab)
1767{
1768    __m128i zero = _mm_setzero_si128();
1769    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
1770    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1771    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1772    __m128i temp1, temp2, temp3, temp4;
1773    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
1774    __m128i in_macro_16x8;
1775    __m128i const_val4_8x16;
1776    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
1777    UWORD8 clip0, clip1, clip2, clip3;
1778    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1779    __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
1780    __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
1781
1782    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1783    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1784    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1785    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1786    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1787    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1788    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1789    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1790
1791    temp1 = _mm_unpacklo_epi8(line1, line2);
1792    temp2 = _mm_unpacklo_epi8(line3, line4);
1793    temp3 = _mm_unpacklo_epi8(line5, line6);
1794    temp4 = _mm_unpacklo_epi8(line7, line8);
1795
1796    line1 = _mm_unpacklo_epi16(temp1, temp2);
1797    line2 = _mm_unpackhi_epi16(temp1, temp2);
1798    line3 = _mm_unpacklo_epi16(temp3, temp4);
1799    line4 = _mm_unpackhi_epi16(temp3, temp4);
1800
1801    temp1 = _mm_unpacklo_epi32(line1, line3);
1802    temp2 = _mm_unpackhi_epi32(line1, line3);
1803    temp3 = _mm_unpacklo_epi32(line2, line4);
1804    temp4 = _mm_unpackhi_epi32(line2, line4);
1805
1806    p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
1807    p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
1808    q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
1809    q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
1810    p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
1811    p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
1812    q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
1813    q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
1814
1815    u1_Bs0 = (u4_bs >> 24) & 0xff;
1816    u1_Bs1 = (u4_bs >> 16) & 0xff;
1817    u1_Bs2 = (u4_bs >> 8) & 0xff;
1818    u1_Bs3 = (u4_bs >> 0) & 0xff;
1819    clip0 = pu1_cliptab[u1_Bs0];
1820    clip1 = pu1_cliptab[u1_Bs1];
1821    clip2 = pu1_cliptab[u1_Bs2];
1822    clip3 = pu1_cliptab[u1_Bs3];
1823
1824    Alpha_8x16 = _mm_set1_epi16(alpha);
1825    Beta_8x16 = _mm_set1_epi16(beta);
1826
1827    bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
1828                                 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
1829
1830    C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
1831                           clip1, clip1, clip0, clip0);
1832
1833    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
1834    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
1835    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
1836
1837    //Cond1 (ABS(p0 - q0) < alpha)
1838    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1839    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1840    temp1 = _mm_add_epi8(temp1, temp2);
1841
1842    temp2 = _mm_unpacklo_epi8(temp1, zero);
1843    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1844
1845    flag1_16x8 = _mm_packs_epi16(temp2, zero);
1846    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
1847
1848    //Cond2 (ABS(q1 - q0) < beta)
1849    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1850    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1851    temp1 = _mm_add_epi8(temp1, temp2);
1852
1853    temp2 = _mm_unpacklo_epi8(temp1, zero);
1854    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1855
1856    flag2_16x8 = _mm_packs_epi16(temp2, zero);
1857    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1858
1859    //Cond3 (ABS(p1 - p0) < beta)
1860    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1861    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1862    temp1 = _mm_add_epi8(temp1, temp2);
1863
1864    temp2 = _mm_unpacklo_epi8(temp1, zero);
1865    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1866
1867    flag2_16x8 = _mm_packs_epi16(temp2, zero);
1868
1869    // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1870    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1871
1872    // (ABS(p2 - p0) < beta)
1873    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1874    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1875    temp1 = _mm_add_epi8(temp1, temp2);
1876
1877    temp2 = _mm_unpacklo_epi8(temp1, zero);
1878    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1879
1880    flag2_16x8 = _mm_packs_epi16(temp2, zero);
1881    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1882
1883    temp2 = _mm_subs_epi16(zero, temp2);
1884
1885    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
1886
1887    // (ABS(q2 - q0) < beta)
1888    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1889    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1890    temp1 = _mm_add_epi8(temp1, temp2);
1891
1892    temp2 = _mm_unpacklo_epi8(temp1, zero);
1893    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1894
1895    flag3_16x8 = _mm_packs_epi16(temp2, zero);
1896    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
1897
1898    temp2 = _mm_subs_epi16(zero, temp2);
1899
1900    C_8x16 = _mm_add_epi16(C_8x16, temp2);
1901
1902    const_val4_8x16 = _mm_set1_epi16(4);
1903    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
1904                           _mm_unpacklo_epi8(p0_16x8, zero));
1905    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
1906                           _mm_unpacklo_epi8(q1_16x8, zero));
1907    temp1 = _mm_slli_epi16(temp1, 2);
1908    temp1 = _mm_add_epi16(temp1, temp2);
1909    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1910    in_macro_16x8 = _mm_srai_epi16(temp1, 3);
1911
1912    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
1913    C_8x16 = _mm_subs_epi16(zero, C_8x16);
1914    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
1915
1916    // p0
1917    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
1918
1919    temp1 = _mm_packus_epi16(temp1, zero);
1920
1921    p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
1922    p0_16x8_2 = _mm_and_si128(
1923                    p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1924
1925    p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
1926
1927    // q0
1928    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
1929
1930    temp1 = _mm_packus_epi16(temp1, zero);
1931
1932    q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
1933    q0_16x8_2 = _mm_and_si128(
1934                    q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1935
1936    q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
1937
1938    //if(Ap < Beta)
1939    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1940                          _mm_unpacklo_epi8(p0_16x8, zero));
1941    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
1942    //temp2 = _mm_subs_epi16(zero,temp2);
1943    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
1944    temp2 = _mm_add_epi16(temp1, temp2);
1945    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1946
1947    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1948    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1949    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1950
1951    // p1
1952    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
1953
1954    temp1 = _mm_packus_epi16(temp1, zero);
1955
1956    p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
1957    p1_16x8 = _mm_and_si128(p1_16x8,
1958                            _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
1959    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
1960
1961    //if(Aq < Beta)
1962    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1963                          _mm_unpacklo_epi8(p0_16x8, zero));
1964    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
1965    //temp2 = _mm_slli_epi16 (temp2, 1);
1966    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
1967    temp2 = _mm_add_epi16(temp1, temp2);
1968    in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1969
1970    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1971    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1972    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1973
1974    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
1975
1976    // q1
1977    temp1 = _mm_packus_epi16(temp1, zero);
1978
1979    q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
1980    q1_16x8 = _mm_and_si128(q1_16x8,
1981                            _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
1982    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
1983
1984    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1985    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
1986    temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
1987    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1988
1989    line7 = _mm_unpacklo_epi16(temp1, temp2);
1990    temp1 = _mm_unpackhi_epi16(temp1, temp2);
1991    line8 = _mm_unpacklo_epi16(temp3, temp4);
1992    temp2 = _mm_unpackhi_epi16(temp3, temp4);
1993
1994    line1 = _mm_unpacklo_epi32(line7, line8);
1995    line2 = _mm_srli_si128(line1, 8);
1996    line3 = _mm_unpackhi_epi32(line7, line8);
1997    line4 = _mm_srli_si128(line3, 8);
1998    line5 = _mm_unpacklo_epi32(temp1, temp2);
1999    line6 = _mm_srli_si128(line5, 8);
2000    line7 = _mm_unpackhi_epi32(temp1, temp2);
2001    line8 = _mm_srli_si128(line7, 8);
2002
2003    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
2004    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
2005    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
2006    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
2007    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
2008    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
2009    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
2010    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
2011}
2012
2013