1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20/*****************************************************************************/
21/*                                                                           */
22/*  File Name         : ih264_deblk_chroma_ssse3.c                           */
23/*                                                                           */
24/*  Description       : Contains function definitions for deblocking         */
25/*                                                                           */
26/*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
27/*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
28/*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
29/*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
30/*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
31/*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
32/*                                                                           */
33/*  Issues / Problems : None                                                 */
34/*                                                                           */
35/*  Revision History  :                                                      */
36/*                                                                           */
37/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
39/*                                      intrinsics                           */
40/*                                                                           */
41/*****************************************************************************/
42
43/*****************************************************************************/
44/* File Includes                                                             */
45/*****************************************************************************/
46
47/* System include files */
48#include <stdio.h>
49
50/* User include files */
51#include "ih264_typedefs.h"
52#include "ih264_platform_macros.h"
53#include "ih264_deblk_edge_filters.h"
54#include "ih264_macros.h"
55
56/*****************************************************************************/
57/* Function Definitions                                                      */
58/*****************************************************************************/
59
60/*****************************************************************************/
61/*                                                                           */
62/*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
63/*                                                                           */
64/*  Description   : This function performs filtering of a chroma block       */
65/*                  vertical edge when the boundary strength is set to 4 in  */
66/*                  high profile.                                            */
67/*                                                                           */
68/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
69/*                  src_strd   - source stride                               */
70/*                  alpha_cb   - alpha value for the boundary in U           */
71/*                  beta_cb    - beta value for the boundary in U            */
72/*                  alpha_cr   - alpha value for the boundary in V           */
73/*                  beta_cr    - beta value for the boundary in V            */
74/*                                                                           */
75/*  Globals       : None                                                     */
76/*                                                                           */
77/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
78/*                  title "Filtering process for edges for bS equal to 4" in */
79/*                  ITU T Rec H.264 with alpha and beta values different in  */
80/*                  U and V.                                                 */
81/*                                                                           */
/*  Outputs       : Filtered p0/q0 samples are written back through pu1_src */
83/*                                                                           */
84/*  Returns       : None                                                     */
85/*                                                                           */
86/*  Issues        : None                                                     */
87/*                                                                           */
88/*  Revision History:                                                        */
89/*                                                                           */
90/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
91/*         12 02 2015   Naveen Kumar P  Initial version                      */
92/*                                                                           */
93/*****************************************************************************/
94void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95                                       WORD32 src_strd,
96                                       WORD32 alpha_cb,
97                                       WORD32 beta_cb,
98                                       WORD32 alpha_cr,
99                                       WORD32 beta_cr)
100{
101    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105    __m128i temp1, temp2, temp3, temp4;
106
107    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109    __m128i flag1, flag2;
110    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111    __m128i zero = _mm_setzero_si128();
112    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113
114    /* Load and transpose the pixel values */
115    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123
124    temp1 = _mm_unpacklo_epi16(linea, lineb);
125    temp2 = _mm_unpacklo_epi16(linec, lined);
126    temp3 = _mm_unpacklo_epi16(linee, linef);
127    temp4 = _mm_unpacklo_epi16(lineg, lineh);
128
129    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133
134    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138    /* End of transpose */
139
140    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146    diff = _mm_abs_epi16(diff);
147    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151    diff = _mm_abs_epi16(diff);
152    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156    diff = _mm_abs_epi16(diff);
157    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162    temp1 = _mm_add_epi16(temp1, temp2);
163    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168    temp1 = _mm_add_epi16(temp1, temp2);
169    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177    diff = _mm_abs_epi16(diff);
178    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182    diff = _mm_abs_epi16(diff);
183    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187    diff = _mm_abs_epi16(diff);
188    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193    temp1 = _mm_add_epi16(temp1, temp2);
194    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199    temp1 = _mm_add_epi16(temp1, temp2);
200    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205    flag1 = _mm_packs_epi16(flag1, flag2);
206
207    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217    /* Inverse-transpose and store back */
218    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223    linea = _mm_unpacklo_epi32(temp1, temp3);
224    lineb = _mm_srli_si128(linea, 8);
225    linec = _mm_unpackhi_epi32(temp1, temp3);
226    lined = _mm_srli_si128(linec, 8);
227    linee = _mm_unpacklo_epi32(temp2, temp4);
228    linef = _mm_srli_si128(linee, 8);
229    lineg = _mm_unpackhi_epi32(temp2, temp4);
230    lineh = _mm_srli_si128(lineg, 8);
231
232    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241}
242
243/*****************************************************************************/
244/*                                                                           */
245/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
246/*                                                                           */
247/*  Description   : This function performs filtering of a chroma block       */
248/*                  horizontal edge when the boundary strength is set to 4   */
249/*                  in high profile.                                         */
250/*                                                                           */
251/*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
252/*                  src_strd   - source stride                               */
253/*                  alpha_cb   - alpha value for the boundary in U           */
254/*                  beta_cb    - beta value for the boundary in U            */
255/*                  alpha_cr   - alpha value for the boundary in V           */
256/*                  beta_cr    - beta value for the boundary in V            */
257/*                                                                           */
258/*  Globals       : None                                                     */
259/*                                                                           */
260/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
261/*                  title "Filtering process for edges for bS equal to 4" in */
262/*                  ITU T Rec H.264 with alpha and beta values different in  */
263/*                  U and V.                                                 */
264/*                                                                           */
/*  Outputs       : Filtered p0/q0 samples are written back through pu1_src */
266/*                                                                           */
267/*  Returns       : None                                                     */
268/*                                                                           */
269/*  Issues        : None                                                     */
270/*                                                                           */
271/*  Revision History:                                                        */
272/*                                                                           */
273/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
274/*         12 02 2015   Naveen Kumar P  Initial version                      */
275/*                                                                           */
276/*****************************************************************************/
277void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278                                       WORD32 src_strd,
279                                       WORD32 alpha_cb,
280                                       WORD32 beta_cb,
281                                       WORD32 alpha_cr,
282                                       WORD32 beta_cr)
283{
284    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285    WORD16 i16_posP1, i16_posP0, i16_posQ1;
286
287    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292    __m128i flag1, flag2;
293    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294    __m128i zero = _mm_setzero_si128();
295    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296    __m128i temp1, temp2;
297
298    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299
300    i16_posQ1 = src_strd;
301    i16_posP0 = src_strd;
302    i16_posP1 = 0;
303
304    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308
309    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313
314    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315    diff = _mm_abs_epi16(diff);
316    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318
319    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320    diff = _mm_abs_epi16(diff);
321    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323
324    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325    diff = _mm_abs_epi16(diff);
326    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327
328    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331    temp1 = _mm_add_epi16(temp1, temp2);
332    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333
334    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337    temp1 = _mm_add_epi16(temp1, temp2);
338    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339
340    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344
345    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346    diff = _mm_abs_epi16(diff);
347    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349
350    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351    diff = _mm_abs_epi16(diff);
352    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354
355    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356    diff = _mm_abs_epi16(diff);
357    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358
359    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362    temp1 = _mm_add_epi16(temp1, temp2);
363    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364
365    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368    temp1 = _mm_add_epi16(temp1, temp2);
369    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370
371    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373
374    flag1 = _mm_packs_epi16(flag1, flag2);
375
376    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381
382    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387
388}
389
390/*****************************************************************************/
391/*                                                                           */
392/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
393/*                                                                           */
394/*  Description   : This function performs filtering of a chroma block       */
395/*                  vertical edge when the boundary strength is less than 4  */
396/*                  in high profile.                                         */
397/*                                                                           */
398/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
399/*                  src_strd         - source stride                         */
400/*                  alpha_cb         - alpha value for the boundary in U     */
401/*                  beta_cb          - beta value for the boundary in U      */
402/*                  alpha_cr         - alpha value for the boundary in V     */
403/*                  beta_cr          - beta value for the boundary in V      */
404/*                  u4_bs            - packed Boundary strength array        */
405/*                  pu1_cliptab_cb   - tc0_table for U                       */
406/*                  pu1_cliptab_cr   - tc0_table for V                       */
407/*                                                                           */
408/*  Globals       : None                                                     */
409/*                                                                           */
410/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
411/*                  title "Filtering process for edges for bS less than 4"   */
412/*                  in ITU T Rec H.264 with alpha and beta values different  */
413/*                  in U and V.                                              */
414/*                                                                           */
/*  Outputs       : Filtered p0/q0 samples are written back through pu1_src */
416/*                                                                           */
417/*  Returns       : None                                                     */
418/*                                                                           */
419/*  Issues        : None                                                     */
420/*                                                                           */
421/*  Revision History:                                                        */
422/*                                                                           */
423/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
424/*         12 02 2015   Naveen Kumar P  Initial version                      */
425/*                                                                           */
426/*****************************************************************************/
427void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428                                         WORD32 src_strd,
429                                         WORD32 alpha_cb,
430                                         WORD32 beta_cb,
431                                         WORD32 alpha_cr,
432                                         WORD32 beta_cr,
433                                         UWORD32 u4_bs,
434                                         const UWORD8 *pu1_cliptab_cb,
435                                         const UWORD8 *pu1_cliptab_cr)
436{
437    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442    __m128i temp1, temp2, temp3, temp4;
443
444    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446    __m128i flag_bs, flag1, flag2;
447    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448    __m128i zero = _mm_setzero_si128();
449    __m128i C0_uv_8x16;
450    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451
452    u1_Bs0 = (u4_bs >> 24) & 0xff;
453    u1_Bs1 = (u4_bs >> 16) & 0xff;
454    u1_Bs2 = (u4_bs >> 8) & 0xff;
455    u1_Bs3 = (u4_bs >> 0) & 0xff;
456
457    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462
463    /* Load and transpose the pixel values */
464    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472
473    temp1 = _mm_unpacklo_epi16(linea, lineb);
474    temp2 = _mm_unpacklo_epi16(linec, lined);
475    temp3 = _mm_unpacklo_epi16(linee, linef);
476    temp4 = _mm_unpacklo_epi16(lineg, lineh);
477
478    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482
483    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487    /* End of transpose */
488
489    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493
494    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495    diff = _mm_abs_epi16(diff);
496    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498
499    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500    diff = _mm_abs_epi16(diff);
501    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503
504    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505    diff = _mm_abs_epi16(diff);
506    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507
508    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509    diff = _mm_slli_epi16(diff, 2);
510    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511    diff = _mm_add_epi16(diff, diff1);
512    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513    in_macro = _mm_srai_epi16(diff, 3);
514
515    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519
520    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521
522    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525
526    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528
529    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533
534    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535    diff = _mm_abs_epi16(diff);
536    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538
539    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540    diff = _mm_abs_epi16(diff);
541    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543
544    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545    diff = _mm_abs_epi16(diff);
546    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547
548    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549    diff = _mm_slli_epi16(diff, 2);
550    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551    diff = _mm_add_epi16(diff, diff1);
552    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553    in_macro = _mm_srai_epi16(diff, 3);
554
555    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559
560    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561
562    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565
566    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568
569    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571
572    flag1 = _mm_packs_epi16(flag1, flag2);
573    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574
575    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579
580    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584
585    /* Inverse-transpose and store back */
586    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590
591    linea = _mm_unpacklo_epi32(temp1, temp3);
592    lineb = _mm_srli_si128(linea, 8);
593    linec = _mm_unpackhi_epi32(temp1, temp3);
594    lined = _mm_srli_si128(linec, 8);
595    linee = _mm_unpacklo_epi32(temp2, temp4);
596    linef = _mm_srli_si128(linee, 8);
597    lineg = _mm_unpackhi_epi32(temp2, temp4);
598    lineh = _mm_srli_si128(lineg, 8);
599
600    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608
609}
610
611/*****************************************************************************/
612/*                                                                           */
613/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
614/*                                                                           */
615/*  Description   : This function performs filtering of a chroma block       */
616/*                  horizontal edge when the boundary strength is less than  */
617/*                  4 in high profile.                                       */
618/*                                                                           */
619/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
620/*                  src_strd         - source stride                         */
621/*                  alpha_cb         - alpha value for the boundary in U     */
622/*                  beta_cb          - beta value for the boundary in U      */
623/*                  alpha_cr         - alpha value for the boundary in V     */
624/*                  beta_cr          - beta value for the boundary in V      */
625/*                  u4_bs            - packed Boundary strength array        */
626/*                  pu1_cliptab_cb   - tc0_table for U                       */
627/*                  pu1_cliptab_cr   - tc0_table for V                       */
628/*                                                                           */
629/*  Globals       : None                                                     */
630/*                                                                           */
631/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
632/*                  title "Filtering process for edges for bS less than 4"   */
633/*                  in ITU T Rec H.264 with alpha and beta values different  */
634/*                  in U and V.                                              */
635/*                                                                           */
636/*  Outputs       : None                                                     */
637/*                                                                           */
638/*  Returns       : None                                                     */
639/*                                                                           */
640/*  Issues        : None                                                     */
641/*                                                                           */
642/*  Revision History:                                                        */
643/*                                                                           */
644/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
645/*         12 02 2015   Naveen Kumar P  Initial version                      */
646/*                                                                           */
647/*****************************************************************************/
648void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
649                                         WORD32 src_strd,
650                                         WORD32 alpha_cb,
651                                         WORD32 beta_cb,
652                                         WORD32 alpha_cr,
653                                         WORD32 beta_cr,
654                                         UWORD32 u4_bs,
655                                         const UWORD8 *pu1_cliptab_cb,
656                                         const UWORD8 *pu1_cliptab_cr)
657{
658    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
659    WORD16 i16_posP1, i16_posP0, i16_posQ1;
660    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
661
662    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
663    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
664    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
665    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
666    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
667    __m128i flag_bs, flag1, flag2;
668    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
669    __m128i zero = _mm_setzero_si128();
670    __m128i C0_uv_8x16;
671    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
672
673    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
674
675    i16_posQ1 = src_strd;
676    i16_posP0 = src_strd;
677    i16_posP1 = 0;
678
679    u1_Bs0 = (u4_bs >> 24) & 0xff;
680    u1_Bs1 = (u4_bs >> 16) & 0xff;
681    u1_Bs2 = (u4_bs >> 8) & 0xff;
682    u1_Bs3 = (u4_bs >> 0) & 0xff;
683
684    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
685                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
686                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
687    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
688    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
689
690    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
691    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
692    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
693    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
694
695    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
696    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
697    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
698    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
699
700    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
701    diff = _mm_abs_epi16(diff);
702    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
703    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
704
705    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
706    diff = _mm_abs_epi16(diff);
707    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
708    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
709
710    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
711    diff = _mm_abs_epi16(diff);
712    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
713
714    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
715    diff = _mm_slli_epi16(diff, 2);
716    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
717    diff = _mm_add_epi16(diff, diff1);
718    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
719    in_macro = _mm_srai_epi16(diff, 3);
720
721    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
722                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
723                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
724                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
725
726    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
727
728    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
729    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
730    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
731
732    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
733    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
734
735    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
736    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
737    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
738    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
739
740    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
741    diff = _mm_abs_epi16(diff);
742    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
743    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
744
745    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
746    diff = _mm_abs_epi16(diff);
747    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
748    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
749
750    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
751    diff = _mm_abs_epi16(diff);
752    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
753
754    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
755    diff = _mm_slli_epi16(diff, 2);
756    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
757    diff = _mm_add_epi16(diff, diff1);
758    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
759    in_macro = _mm_srai_epi16(diff, 3);
760
761    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
762                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
763                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
764                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
765
766    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
767
768    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
769    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
770    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
771
772    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
773    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
774
775    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
776    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
777
778    flag1 = _mm_packs_epi16(flag1, flag2);
779    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
780
781    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
782                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
783    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
784    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
785    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
786
787    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
788                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
789    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
790    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
791    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
792
793}
794
795/*****************************************************************************/
796/*                                                                           */
797/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
798/*                                                                           */
799/*  Description   : This function performs filtering of a chroma block       */
800/*                  vertical edge when boundary strength is set to 4 in high */
801/*                  profile.                                                 */
802/*                                                                           */
803/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
804/*                  src_strd         - source stride                         */
805/*                  alpha_cb         - alpha value for the boundary in U     */
806/*                  beta_cb          - beta value for the boundary in U      */
807/*                  alpha_cr         - alpha value for the boundary in V     */
808/*                  beta_cr          - beta value for the boundary in V      */
809/*                  u4_bs            - packed Boundary strength array        */
810/*                  pu1_cliptab_cb   - tc0_table for U                       */
811/*                  pu1_cliptab_cr   - tc0_table for V                       */
812/*                                                                           */
813/*  Globals       : None                                                     */
814/*                                                                           */
815/*  Processing    : When the function is called twice, this operation is as  */
816/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
817/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
818/*                  with alpha and beta values different in U and V.         */
819/*                                                                           */
820/*  Outputs       : None                                                     */
821/*                                                                           */
822/*  Returns       : None                                                     */
823/*                                                                           */
824/*  Issues        : None                                                     */
825/*                                                                           */
826/*  Revision History:                                                        */
827/*                                                                           */
828/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
829/*         12 02 2015   Naveen Kumar P  Initial version                      */
830/*                                                                           */
831/*****************************************************************************/
832void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
833                                             WORD32 src_strd,
834                                             WORD32 alpha_cb,
835                                             WORD32 beta_cb,
836                                             WORD32 alpha_cr,
837                                             WORD32 beta_cr)
838{
839    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
840    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
841    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
842    __m128i linea, lineb, linec, lined;
843    __m128i temp1, temp2;
844
845    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
846    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
847    __m128i flag1;
848    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
849    __m128i zero = _mm_setzero_si128();
850    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
851
852    /* Load and transpose the pixel values */
853    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
854    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
855    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
856    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
857
858    temp1 = _mm_unpacklo_epi16(linea, lineb);
859    temp2 = _mm_unpacklo_epi16(linec, lined);
860
861    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
862    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
863    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
864    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
865    /* End of transpose */
866
867    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
868    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
869    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
870    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
871
872    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
873    diff = _mm_abs_epi16(diff);
874    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
875    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
876
877    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
878    diff = _mm_abs_epi16(diff);
879    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
880    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
881
882    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
883    diff = _mm_abs_epi16(diff);
884    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
885
886    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
887    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
888    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
889    temp1 = _mm_add_epi16(temp1, temp2);
890    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
891
892    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
893    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
894    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
895    temp1 = _mm_add_epi16(temp1, temp2);
896    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
897
898    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
899    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
900
901    flag1 = _mm_packs_epi16(flag1, flag1);
902
903    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
904                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
905    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
906    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
907
908    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
909                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
910    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
911    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
912
913    /* Inverse-transpose and store back */
914    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
915    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
916
917    linea = _mm_unpacklo_epi32(temp1, temp2);
918    lineb = _mm_srli_si128(linea, 8);
919    linec = _mm_unpackhi_epi32(temp1, temp2);
920    lined = _mm_srli_si128(linec, 8);
921
922    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
923    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
924    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
925    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
926
927}
928
929/*****************************************************************************/
930/*                                                                           */
931/*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
932/*                                                                           */
933/*  Description   : This function performs filtering of a chroma block       */
934/*                  vertical edge when boundary strength is less than 4 in   */
935/*                  high profile.                                            */
936/*                                                                           */
937/*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
938/*                  src_strd         - source stride                         */
939/*                  alpha_cb         - alpha value for the boundary in U     */
940/*                  beta_cb          - beta value for the boundary in U      */
941/*                  alpha_cr         - alpha value for the boundary in V     */
942/*                  beta_cr          - beta value for the boundary in V      */
943/*                  u4_bs            - packed Boundary strength array        */
944/*                  pu1_cliptab_cb   - tc0_table for U                       */
945/*                  pu1_cliptab_cr   - tc0_table for V                       */
946/*                                                                           */
947/*  Globals       : None                                                     */
948/*                                                                           */
949/*  Processing    : When the function is called twice, this operation is as  */
950/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
951/*                  process for edges for bS less than 4" in ITU T Rec H.264 */
952/*                  with alpha and beta values different in U and V.         */
953/*                                                                           */
954/*  Outputs       : None                                                     */
955/*                                                                           */
956/*  Returns       : None                                                     */
957/*                                                                           */
958/*  Issues        : None                                                     */
959/*                                                                           */
960/*  Revision History:                                                        */
961/*                                                                           */
962/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
963/*         12 02 2015   Naveen Kumar P  Initial version                      */
964/*                                                                           */
965/*****************************************************************************/
966void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
967                                               WORD32 src_strd,
968                                               WORD32 alpha_cb,
969                                               WORD32 beta_cb,
970                                               WORD32 alpha_cr,
971                                               WORD32 beta_cr,
972                                               UWORD32 u4_bs,
973                                               const UWORD8 *pu1_cliptab_cb,
974                                               const UWORD8 *pu1_cliptab_cr)
975{
976    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
977    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
978    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
979    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
980    __m128i linea, lineb, linec, lined;
981    __m128i temp1, temp2;
982
983    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
984    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
985    __m128i flag_bs, flag1;
986    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
987    __m128i zero = _mm_setzero_si128();
988    __m128i C0_uv_8x16;
989    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
990
991    u1_Bs0 = (u4_bs >> 24) & 0xff;
992    u1_Bs1 = (u4_bs >> 16) & 0xff;
993    u1_Bs2 = (u4_bs >> 8) & 0xff;
994    u1_Bs3 = (u4_bs >> 0) & 0xff;
995
996    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
997                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
998    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
999    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
1000
1001    /* Load and transpose the pixel values */
1002    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
1003    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
1004    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
1005    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
1006
1007    temp1 = _mm_unpacklo_epi16(linea, lineb);
1008    temp2 = _mm_unpacklo_epi16(linec, lined);
1009
1010    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
1011    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
1012    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
1013    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
1014    /* End of transpose */
1015
1016    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
1017    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
1018    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
1019    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
1020
1021    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
1022    diff = _mm_abs_epi16(diff);
1023    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
1024    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
1025
1026    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
1027    diff = _mm_abs_epi16(diff);
1028    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
1029    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1030
1031    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
1032    diff = _mm_abs_epi16(diff);
1033    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1034
1035    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
1036    diff = _mm_slli_epi16(diff, 2);
1037    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
1038    diff = _mm_add_epi16(diff, diff1);
1039    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
1040    in_macro = _mm_srai_epi16(diff, 3);
1041
1042    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
1043                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
1044                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
1045                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
1046
1047    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
1048
1049    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
1050    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
1051    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
1052
1053    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
1054    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
1055
1056    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
1057    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
1058
1059    flag1 = _mm_packs_epi16(flag1, flag1);
1060    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
1061
1062    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
1063                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1064    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
1065    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
1066
1067    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
1068                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1069    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
1070    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
1071
1072    /* Inverse-transpose and store back */
1073    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
1074    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
1075
1076    linea = _mm_unpacklo_epi32(temp1, temp2);
1077    lineb = _mm_srli_si128(linea, 8);
1078    linec = _mm_unpackhi_epi32(temp1, temp2);
1079    lined = _mm_srli_si128(linec, 8);
1080
1081    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
1082    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
1083    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
1084    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
1085
1086}
1087
1088