1/****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19*/ 20/*****************************************************************************/ 21/* */ 22/* File Name : ih264_deblk_chroma_ssse3.c */ 23/* */ 24/* Description : Contains function definitions for deblocking */ 25/* */ 26/* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */ 27/* ih264_deblk_chroma_horz_bs4_ssse3() */ 28/* ih264_deblk_chroma_vert_bslt4_ssse3() */ 29/* ih264_deblk_chroma_horz_bslt4_ssse3() */ 30/* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ 31/* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ 32/* */ 33/* Issues / Problems : None */ 34/* */ 35/* Revision History : */ 36/* */ 37/* DD MM YYYY Author(s) Changes (Describe the changes made) */ 38/* 12 02 2015 Naveen Kumar P Added chrom deblocking ssse3 */ 39/* intrinsics */ 40/* */ 41/*****************************************************************************/ 42 43/*****************************************************************************/ 44/* File Includes */ 45/*****************************************************************************/ 46 47/* System include files */ 48#include <stdio.h> 49 50/* User include files */ 
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"

/*****************************************************************************/
/*  Function Definitions                                                     */
/*****************************************************************************/

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when the boundary strength is set to 4 in  */
/*                  high profile.                                            */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0 of U             */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb  - beta value for the boundary in U              */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr  - beta value for the boundary in V              */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264 with alpha and beta values different in  */
/*                  U and V.                                                 */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    /* Pack the U threshold into the low 16 bits and the V threshold into   */
    /* the high 16 bits of one 32-bit word. Broadcast with _mm_set1_epi32   */
    /* below so that, after the interleaved U/V bytes are widened to 16-bit */
    /* lanes, each lane is compared against the threshold of its own plane. */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i temp1, temp2, temp3, temp4;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1, flag2; /* per-lane filter-enable masks (low/high halves) */
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Load and transpose the pixel values: eight rows of eight bytes       */
    /* straddling the vertical edge (p1 p0 | q0 q1, U and V interleaved)    */
    /* are rearranged so that each of p1/p0/q0/q1 ends up as one register   */
    /* of 16 bytes covering all eight rows.                                 */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);
    temp3 = _mm_unpacklo_epi16(linee, linef);
    temp4 = _mm_unpacklo_epi16(lineg, lineh);

    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    /* End of transpose */

    /* Widen the lower eight bytes (first four rows) to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* Repeat the conditions and the filter for the upper eight bytes */
    /* (last four rows).                                              */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* Narrow the two filtered halves back to 16 unsigned bytes */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Narrow the two 16-bit masks to one byte mask */
    flag1 = _mm_packs_epi16(flag1, flag2);

    /* Byte-wise select: filtered value where the mask is set, original     */
    /* value otherwise.                                                     */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp3);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp3);
    lined = _mm_srli_si128(linec, 8);
    linee = _mm_unpacklo_epi32(temp2, temp4);
    linef = _mm_srli_si128(linee, 8);
    lineg = _mm_unpackhi_epi32(temp2, temp4);
    lineh = _mm_srli_si128(lineg, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  horizontal edge when the boundary strength is set to 4   */
/*                  in high profile.                                         */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0 of U             */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb  - beta value for the boundary in U              */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr  - beta value for the boundary in V              */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264 with alpha and beta values different in  */
/*                  U and V.
                                                                             */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    WORD16 i16_posP1, i16_posP0, i16_posQ1; /* row offsets of p1, p0, q1 */

    UWORD8 *pu1_HorzPixelUV; /*!< Pointer to the first pixel of the boundary */
    /* U threshold in the low 16 bits, V threshold in the high 16 bits of   */
    /* each 32-bit lane, so the widened interleaved U/V samples are tested  */
    /* against the threshold of their own plane.                            */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1, flag2; /* per-lane filter-enable masks (low/high halves) */
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    __m128i temp1, temp2;

    /* p1 row is two rows above q0; no transpose is needed for a horizontal */
    /* edge - each register directly holds one full row of interleaved U/V. */
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

    i16_posQ1 = src_strd;
    i16_posP0 = src_strd;
    i16_posP1 = 0;

    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

    /* Widen the lower eight bytes (left half of the edge) to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* Repeat the conditions and the filter for the upper eight bytes */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* Narrow filtered halves back to bytes, and the masks to a byte mask */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);

    /* Byte-wise select filtered vs original, then store the p0 row */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

    /* Byte-wise select filtered vs original, then store the q0 row */
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when the boundary strength is less than 4  */
/*                  in high profile.                                         */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0 of U             */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb  - beta value for the boundary in U              */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr  - beta value for the boundary in V              */
/*                  u4_bs    - packed Boundary strength array                */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.
                                                                             */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; /* per-edge boundary strengths */
    /* U threshold in the low 16 bits, V threshold in the high 16 bits of   */
    /* each 32-bit lane, matching the widened interleaved U/V samples.      */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i temp1, temp2, temp3, temp4;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1, flag2;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16; /* clipping bound tc = tc0 + 1, per U/V lane */
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Unpack the four boundary strengths (MSB first) */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Each bS value governs four consecutive bytes (two interleaved U/V    */
    /* pairs); build a byte mask that is all-ones where bS != 0.            */
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

    /* Load and transpose the pixel values: eight rows of eight bytes       */
    /* straddling the vertical edge become one register per column          */
    /* p1/p0/q0/q1 (16 interleaved U/V bytes each).                         */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);
    temp3 = _mm_unpacklo_epi16(linee, linef);
    temp4 = _mm_unpacklo_epi16(lineg, lineh);

    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    /* End of transpose */

    /* Widen the lower eight bytes (first four rows) to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (4*(q0 - p0) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* tc0 looked up per bS, interleaved Cb/Cr to match the U/V lanes */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* tc = tc0 + 1 for chroma */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); /* p0' = p0 + delta */
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); /* q0' = q0 - delta */

    /* Repeat for the upper eight bytes (last four rows, bS2/bS3) */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (4*(q0 - p0) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

    /* tc = tc0 + 1 for chroma */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); /* p0' = p0 + delta */
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); /* q0' = q0 - delta */

    /* Narrow the filtered halves back to bytes, and the masks to bytes */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

    /* Byte-wise select: filtered where the final mask is set */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp3);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp3);
    lined = _mm_srli_si128(linec, 8);
    linee = _mm_unpacklo_epi32(temp2, temp4);
    linef = _mm_srli_si128(linee, 8);
    lineg = _mm_unpackhi_epi32(temp2, temp4);
    lineh = _mm_srli_si128(lineg, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  horizontal edge when the boundary strength is less than  */
/*                  4 in high profile.                                       */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0 of U             */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb  - beta value for the boundary in U              */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr  - beta value for the boundary in V              */
/*                  u4_bs    - packed Boundary strength array                */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.
                                                                             */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    WORD16 i16_posP1, i16_posP0, i16_posQ1; /* row offsets of p1, p0, q1 */
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; /* per-edge boundary strengths */

    UWORD8 *pu1_HorzPixelUV; /*!< Pointer to the first pixel of the boundary */
    /* U threshold in the low 16 bits, V threshold in the high 16 bits of   */
    /* each 32-bit lane, matching the widened interleaved U/V samples.      */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1, flag2;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16; /* clipping bound tc = tc0 + 1, per U/V lane */
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* p1 row is two rows above q0; no transpose is needed for a horizontal */
    /* edge - each register directly holds one full row of interleaved U/V. */
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

    i16_posQ1 = src_strd;
    i16_posP0 = src_strd;
    i16_posP1 = 0;

    /* Unpack the four boundary strengths (MSB first) */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Each bS value governs four consecutive bytes (two interleaved U/V    */
    /* pairs); build a byte mask that is all-ones where bS != 0.            */
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

    /* Widen the lower eight bytes (left half, bS0/bS1) to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (4*(q0 - p0) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* tc0 looked up per bS, interleaved Cb/Cr to match the U/V lanes */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* tc = tc0 + 1 for chroma */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); /* p0' = p0 + delta */
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); /* q0' = q0 - delta */

    /* Repeat for the upper eight bytes (right half, bS2/bS3) */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0-q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1-q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1-p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (4*(q0 - p0) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

    /* tc = tc0 + 1 for chroma */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); /* p0' = p0 + delta */
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); /* q0' = q0 - delta */

    /* Narrow the filtered halves back to bytes, and the masks to bytes */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

    /* Byte-wise select filtered vs original, then store the p0 row */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

    /* Byte-wise select filtered vs original, then store the q0 row */
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when boundary strength is set to 4 in high */
/*                  profile.                                                 */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0 of U             */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb  - beta value for the boundary in U              */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr  - beta value for the boundary in V              */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : When the function is called twice, this operation is as  */
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
/*                  with alpha and beta values different in U and V.
                                                                             */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 alpha_cb,
                                             WORD32 beta_cb,
                                             WORD32 alpha_cr,
                                             WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    /* Pack the per-channel thresholds so each 32-bit lane holds {cr:hi16,
       cb:lo16}; after set1_epi32 this lines up with the interleaved U/V
       16-bit lanes produced by the byte unpack below. */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined;
    __m128i temp1, temp2;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1;
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Load and transpose the pixel values: 4 rows of 8 interleaved UV bytes
       (p1 p0 | q0 q1, two bytes per chroma pel) become one register per
       column (p1, p0, q0, q1), each holding all 4 rows in its low 8 bytes. */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);

    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
    /* End of transpose */

    /* Widen the 8 UV bytes of each column to 16-bit lanes for arithmetic. */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    /* Condition 1: |p0 - q0| < alpha (alpha is per-channel via lane layout) */
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16);
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    /* Condition 2: |q1 - q0| < beta */
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16);
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* Condition 3: |p1 - p0| < beta; flag1 now holds the combined
       filter/no-filter decision per 16-bit lane. */
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16);
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* bS=4 chroma filter (Sec. 8.7.2.4): p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* Narrow the filtered values back to bytes (values fit in 0..255, so
       the unsigned saturation cannot alter them). */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

    /* Narrow the 16-bit all-ones/all-zeros flags to a byte mask. */
    flag1 = _mm_packs_epi16(flag1, flag1);

    /* Select per byte: filtered value where flag1 is set, original p0/q0
       elsewhere (mask-and + add acts as a byte blend). */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp2);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp2);
    lined = _mm_srli_si128(linec, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when boundary strength is less than 4 in   */
/*                  high profile.                                            */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                  u4_bs - packed Boundary strength array                   */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : When the function is called twice, this operation is as  */
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
/*                  process for edges for bS less than 4" in ITU T Rec H.264 */
/*                  with alpha and beta values different in U and V.
                                                                             */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               WORD32 alpha_cb,
                                               WORD32 beta_cb,
                                               WORD32 alpha_cr,
                                               WORD32 beta_cr,
                                               UWORD32 u4_bs,
                                               const UWORD8 *pu1_cliptab_cb,
                                               const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    /* Pack the per-channel thresholds so each 32-bit lane holds {cr:hi16,
       cb:lo16}; after set1_epi32 this lines up with the interleaved U/V
       16-bit lanes produced by the byte unpack below. */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined;
    __m128i temp1, temp2;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16;
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Unpack the four boundary strengths: Bs0 comes from the most
       significant byte of u4_bs, Bs3 from the least significant. */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Build a byte mask that is 0xFF where Bs != 0 (filtering allowed);
       each Bs value covers one UV pair, hence the duplicated bytes. */
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

    /* Load and transpose the pixel values: 4 rows of 8 interleaved UV bytes
       (p1 p0 | q0 q1, two bytes per chroma pel) become one register per
       column (p1, p0, q0, q1), each holding all 4 rows in its low 8 bytes. */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);

    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
    /* End of transpose */

    /* Widen the 8 UV bytes of each column to 16-bit lanes for arithmetic. */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    /* Condition 1: |p0 - q0| < alpha (alpha is per-channel via lane layout) */
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16);
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    /* Condition 2: |q1 - q0| < beta */
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16);
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* Condition 3: |p1 - p0| < beta; flag1 now holds the combined
       threshold decision per 16-bit lane. */
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16);
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* Delta (Sec. 8.7.2.4): ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* Per-lane clip thresholds tc0 looked up by boundary strength, laid out
       to match the interleaved cb/cr lanes (lowest lane: cb of row 0). */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* For chroma the clip value is tc0 + 1. */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    /* CLIP3(-C0, C0, delta) via min then max. */
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro);
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    /* p0' = p0 + delta, q0' = q0 - delta */
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Narrow back to bytes with unsigned saturation (clamps to 0..255). */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

    flag1 = _mm_packs_epi16(flag1, flag1);
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

    /* Select per byte: filtered value where flag1 is set, original p0/q0
       elsewhere (mask-and + add acts as a byte blend). */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp2);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp2);
    lined = _mm_srli_si128(linec, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

}