ihevcd_deblk.c revision 0d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098
1/****************************************************************************** 2* 3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4* 5* Licensed under the Apache License, Version 2.0 (the "License"); 6* you may not use this file except in compliance with the License. 7* You may obtain a copy of the License at: 8* 9* http://www.apache.org/licenses/LICENSE-2.0 10* 11* Unless required by applicable law or agreed to in writing, software 12* distributed under the License is distributed on an "AS IS" BASIS, 13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14* See the License for the specific language governing permissions and 15* limitations under the License. 16* 17******************************************************************************/ 18/** 19******************************************************************************* 20* @file 21* ihevc_deblk.c 22* 23* @brief 24* Contains definition for the ctb level deblk function 25* 26* @author 27* Srinivas T 28* 29* @par List of Functions: 30* - ihevc_deblk() 31* 32* @remarks 33* None 34* 35******************************************************************************* 36*/ 37 38#include <stdio.h> 39#include <stddef.h> 40#include <stdlib.h> 41#include <string.h> 42#include <assert.h> 43 44#include "ihevc_typedefs.h" 45#include "iv.h" 46#include "ivd.h" 47#include "ihevcd_cxa.h" 48#include "ithread.h" 49 50#include "ihevc_defs.h" 51#include "ihevc_debug.h" 52#include "ihevc_defs.h" 53#include "ihevc_structs.h" 54#include "ihevc_macros.h" 55#include "ihevc_platform_macros.h" 56#include "ihevc_cabac_tables.h" 57 58#include "ihevc_error.h" 59#include "ihevc_common_tables.h" 60 61#include "ihevcd_trace.h" 62#include "ihevcd_defs.h" 63#include "ihevcd_function_selector.h" 64#include "ihevcd_structs.h" 65#include "ihevcd_error.h" 66#include "ihevcd_nal.h" 67#include "ihevcd_bitstream.h" 68#include "ihevcd_job_queue.h" 69#include "ihevcd_utils.h" 70#include "ihevcd_debug.h" 71 72#include "ihevc_deblk.h" 73#include "ihevc_deblk_tables.h" 74#include "ihevcd_profile.h" 75/** 76******************************************************************************* 77* 78* @brief 79* Deblock CTB level function. 80* 81* @par Description: 82* For a given CTB, deblocking on both vertical and 83* horizontal edges is done. Both the luma and chroma 84* blocks are processed 85* 86* @param[in] ps_deblk 87* Pointer to the deblock context 88* 89* @returns 90* 91* @remarks 92* None 93* 94******************************************************************************* 95*/ 96 97void ihevcd_deblk_ctb(deblk_ctxt_t *ps_deblk, 98 WORD32 i4_is_last_ctb_x, 99 WORD32 i4_is_last_ctb_y) 100{ 101 WORD32 ctb_size; 102 WORD32 log2_ctb_size; 103 UWORD32 u4_bs; 104 WORD32 bs_tz; /*Leading zeros in boundary strength*/ 105 WORD32 qp_p, qp_q; 106 107 WORD32 filter_p, filter_q; 108 109 UWORD8 *pu1_src; 110 WORD32 qp_strd; 111 UWORD32 *pu4_vert_bs, *pu4_horz_bs; 112 UWORD32 *pu4_ctb_vert_bs, *pu4_ctb_horz_bs; 113 WORD32 vert_bs_strd, horz_bs_strd; 114 WORD32 src_strd; 115 UWORD8 *pu1_qp; 116 UWORD16 *pu2_ctb_no_loop_filter_flag; 117 UWORD16 au2_ctb_no_loop_filter_flag[9]; 118 119 WORD32 col, row; 120 121 /* Flag to indicate if QP is constant in CTB 122 * 0 - top_left, 1 - top, 2 - left, 3 - current */ 123 UWORD32 u4_qp_const_in_ctb[4] = { 0, 0, 0, 0 }; 124 WORD32 ctb_indx; 125 WORD32 chroma_yuv420sp_vu = ps_deblk->is_chroma_yuv420sp_vu; 126 sps_t *ps_sps; 127 pps_t *ps_pps; 128 codec_t *ps_codec; 129 slice_header_t *ps_slice_hdr; 130 131 PROFILE_DISABLE_DEBLK(); 132 133 ps_sps = ps_deblk->ps_sps; 134 ps_pps = ps_deblk->ps_pps; 135 ps_codec = ps_deblk->ps_codec; 136 ps_slice_hdr = ps_deblk->ps_slice_hdr; 137 138 log2_ctb_size = ps_sps->i1_log2_ctb_size; 139 ctb_size = (1 << ps_sps->i1_log2_ctb_size); 140 141 /* strides are in units of number of bytes */ 142 /* ctb_size * ctb_size / 8 / 16 is the number of bytes needed per CTB */ 143 vert_bs_strd = ps_sps->i2_pic_wd_in_ctb << (2 * log2_ctb_size - 7); 144 horz_bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) << (2 * log2_ctb_size - 7); 145 pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_deblk->s_bs_ctxt.pu4_pic_vert_bs + 146 (ps_deblk->i4_ctb_x << (2 * log2_ctb_size - 7)) + 147 ps_deblk->i4_ctb_y * vert_bs_strd); 148 pu4_ctb_vert_bs = pu4_vert_bs; 149 150 pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_deblk->s_bs_ctxt.pu4_pic_horz_bs + 151 (ps_deblk->i4_ctb_x << (2 * log2_ctb_size - 7)) + 152 ps_deblk->i4_ctb_y * horz_bs_strd); 153 pu4_ctb_horz_bs = pu4_horz_bs; 154 155 qp_strd = ps_sps->i2_pic_wd_in_ctb << (log2_ctb_size - 3); 156 pu1_qp = ps_deblk->s_bs_ctxt.pu1_pic_qp + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * qp_strd) << (log2_ctb_size - 3)); 157 158 pu2_ctb_no_loop_filter_flag = ps_deblk->au2_ctb_no_loop_filter_flag; 159 160 ctb_indx = ps_deblk->i4_ctb_x + ps_sps->i2_pic_wd_in_ctb * ps_deblk->i4_ctb_y; 161 if(i4_is_last_ctb_y) 162 { 163 pu4_vert_bs = (UWORD32 *)((UWORD8 *)pu4_vert_bs + vert_bs_strd); 164 pu4_ctb_vert_bs = pu4_vert_bs; 165 /* ctb_size/8 is the number of edges per CTB 166 * ctb_size/4 is the number of BS values needed per edge 167 * divided by 8 for the number of bytes 168 * 2 is the number of bits needed for each BS value */ 169 memset(pu4_vert_bs, 0, 1 << (2 * log2_ctb_size - 7)); 170 171 pu1_qp += (qp_strd << (log2_ctb_size - 3)); 172 pu2_ctb_no_loop_filter_flag += (ctb_size >> 3); 173 ctb_indx += ps_sps->i2_pic_wd_in_ctb; 174 } 175 176 if(i4_is_last_ctb_x) 177 { 178 pu4_horz_bs = (UWORD32 *)((UWORD8 *)pu4_horz_bs + (1 << (2 * log2_ctb_size - 7))); 179 pu4_ctb_horz_bs = pu4_horz_bs; 180 memset(pu4_horz_bs, 0, 1 << (2 * log2_ctb_size - 7)); 181 182 pu1_qp += (ctb_size >> 3); 183 184 for(row = 0; row < (ctb_size >> 3) + 1; row++) 185 au2_ctb_no_loop_filter_flag[row] = ps_deblk->au2_ctb_no_loop_filter_flag[row] >> (ctb_size >> 3); 186 pu2_ctb_no_loop_filter_flag = au2_ctb_no_loop_filter_flag; 187 ctb_indx += 1; 188 } 189 190 u4_qp_const_in_ctb[3] = ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx) >> 3] & (1 << (ctb_indx & 7)); 191 192 if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x) 193 { 194 u4_qp_const_in_ctb[2] = ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - 1) >> 3] & (1 << ((ctb_indx - 1) & 7)); 195 } 196 197 if((ps_deblk->i4_ctb_x || i4_is_last_ctb_x) && (ps_deblk->i4_ctb_y || i4_is_last_ctb_y)) 198 { 199 u4_qp_const_in_ctb[0] = 200 ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - ps_sps->i2_pic_wd_in_ctb - 1) >> 3] & 201 (1 << ((ctb_indx - ps_sps->i2_pic_wd_in_ctb - 1) & 7)); 202 } 203 204 205 206 if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y) 207 { 208 u4_qp_const_in_ctb[1] = 209 ps_deblk->s_bs_ctxt.pu1_pic_qp_const_in_ctb[(ctb_indx - ps_sps->i2_pic_wd_in_ctb) >> 3] & 210 (1 << ((ctb_indx - ps_sps->i2_pic_wd_in_ctb) & 7)); 211 } 212 213 src_strd = ps_codec->i4_strd; 214 215 /* Luma Vertical Edge */ 216 217 if(0 == i4_is_last_ctb_x) 218 { 219 /* Top CTB's slice header */ 220 slice_header_t *ps_slice_hdr_top; 221#ifdef GPU_BUILD 222//TODO GPU : Later define it for ARM only version as well 223 { 224 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 225 if(i4_is_last_ctb_y) 226 cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb; 227 ps_slice_hdr_top = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb]; 228 } 229#else 230 { 231 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 232 if(i4_is_last_ctb_y) 233 cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb; 234 ps_slice_hdr_top = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb]; 235 } 236#endif 237 238 pu1_src = ps_deblk->pu1_cur_pic_luma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd) << (log2_ctb_size)); 239 pu1_src += i4_is_last_ctb_y ? ps_deblk->ps_codec->i4_strd << log2_ctb_size : 0; 240 241 /** Deblocking is done on a shifted CTB - 242 * Vertical edge processing is done by shifting the CTB up by four pixels */ 243 pu1_src -= 4 * src_strd; 244 245 for(col = 0; col < ctb_size / 8; col++) 246 { 247 WORD32 shift = 0; 248 249 /* downshift vert_bs by ctb_size/2 for each column 250 * shift = (col & ((MAX_CTB_SIZE >> log2_ctb_size) - 1)) << (log2_ctb_size - 1); 251 * which will reduce to the following assuming ctb size is one of 16, 32 and 64 252 * and deblocking is done on 8x8 grid 253 */ 254 if(6 != log2_ctb_size) 255 shift = (col & 1) << (log2_ctb_size - 1); 256 257 /* BS for the column - Last row is excluded and the top row is included*/ 258 u4_bs = (pu4_vert_bs[0] >> shift) << 2; 259 260 if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y) 261 { 262 /* Picking the last BS of the previous CTB corresponding to the same column */ 263 UWORD32 *pu4_vert_bs_top = (UWORD32 *)((UWORD8 *)pu4_vert_bs - vert_bs_strd); 264 UWORD32 u4_top_bs = (*pu4_vert_bs_top) >> (shift + (1 << (log2_ctb_size - 1)) - 2); 265 u4_bs |= u4_top_bs & 3; 266 } 267 268 for(row = 0; row < ctb_size / 4;) 269 { 270 WORD8 i1_beta_offset_div2 = ps_slice_hdr->i1_beta_offset_div2; 271 WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2; 272 273 /* Trailing zeros are computed and the corresponding rows are not processed */ 274 bs_tz = CTZ(u4_bs) >> 1; 275 if(0 != bs_tz) 276 { 277 u4_bs = u4_bs >> (bs_tz << 1); 278 if((row + bs_tz) >= (ctb_size / 4)) 279 pu1_src += 4 * (ctb_size / 4 - row) * src_strd; 280 else 281 pu1_src += 4 * bs_tz * src_strd; 282 283 row += bs_tz; 284 continue; 285 } 286 287 if(0 == row) 288 { 289 i1_beta_offset_div2 = ps_slice_hdr_top->i1_beta_offset_div2; 290 i1_tc_offset_div2 = ps_slice_hdr_top->i1_tc_offset_div2; 291 292 if(0 == col) 293 { 294 qp_p = u4_qp_const_in_ctb[0] ? 295 pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] : 296 pu1_qp[-qp_strd - 1]; 297 } 298 else 299 { 300 qp_p = u4_qp_const_in_ctb[1] ? 301 pu1_qp[-ctb_size / 8 * qp_strd] : 302 pu1_qp[col - 1 - qp_strd]; 303 } 304 305 qp_q = u4_qp_const_in_ctb[1] ? 306 pu1_qp[-ctb_size / 8 * qp_strd] : 307 pu1_qp[col - qp_strd]; 308 } 309 else 310 { 311 if(0 == col) 312 { 313 qp_p = u4_qp_const_in_ctb[2] ? 314 pu1_qp[-ctb_size / 8] : 315 pu1_qp[((row - 1) >> 1) * qp_strd - 1]; 316 } 317 else 318 { 319 qp_p = u4_qp_const_in_ctb[3] ? 320 pu1_qp[0] : 321 pu1_qp[((row - 1) >> 1) * qp_strd + col - 1]; 322 } 323 324 qp_q = u4_qp_const_in_ctb[3] ? 325 pu1_qp[0] : 326 pu1_qp[((row - 1) >> 1) * qp_strd + col]; 327 } 328 329 filter_p = (pu2_ctb_no_loop_filter_flag[(row + 1) >> 1] >> col) & 1; 330 filter_q = (pu2_ctb_no_loop_filter_flag[(row + 1) >> 1] >> col) & 2; 331 /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */ 332 filter_p = !filter_p; 333 filter_q = !filter_q; 334 335 if(filter_p || filter_q) 336 { 337#if DEBUG_DEBLK_LEAF_LEVEL 338 { 339 DUMP_DEBLK_LUMA_VERT(pu1_src, src_strd, 340 u4_bs & 3, qp_p, qp_q, 341 ps_slice_hdr->i1_beta_offset_div2, 342 ps_slice_hdr->i1_tc_offset_div2, 343 filter_p, filter_q); 344 } 345#endif 346 ps_codec->s_func_selector.ihevc_deblk_luma_vert_fptr(pu1_src, src_strd, 347 u4_bs & 3, qp_p, qp_q, 348 i1_beta_offset_div2, 349 i1_tc_offset_div2, 350 filter_p, filter_q); 351 } 352 353 pu1_src += 4 * src_strd; 354 u4_bs = u4_bs >> 2; 355 row++; 356 } 357 358 if((64 == ctb_size) || 359 ((32 == ctb_size) && (col & 1))) 360 { 361 pu4_vert_bs++; 362 } 363 pu1_src -= (src_strd << log2_ctb_size); 364 pu1_src += 8; 365 } 366 pu4_vert_bs = pu4_ctb_vert_bs; 367 } 368 369 370 /* Luma Horizontal Edge */ 371 372 if(0 == i4_is_last_ctb_y) 373 { 374 375 /* Left CTB's slice header */ 376 slice_header_t *ps_slice_hdr_left; 377#ifdef GPU_BUILD 378//TODO GPU : Later define it for ARM only version as well 379 { 380 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 381 if(i4_is_last_ctb_x) 382 cur_ctb_indx += 1; 383 ps_slice_hdr_left = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1]; 384 } 385#else 386 { 387 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 388 if(i4_is_last_ctb_x) 389 cur_ctb_indx += 1; 390 ps_slice_hdr_left = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1]; 391 } 392#endif 393 pu1_src = ps_deblk->pu1_cur_pic_luma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd) << log2_ctb_size); 394 pu1_src += i4_is_last_ctb_x ? ctb_size : 0; 395 396 /** Deblocking is done on a shifted CTB - 397 * Horizontal edge processing is done by shifting the CTB left by four pixels */ 398 pu1_src -= 4; 399 for(row = 0; row < ctb_size / 8; row++) 400 { 401 WORD32 shift = 0; 402 403 /* downshift vert_bs by ctb_size/2 for each column 404 * shift = (row & (MAX_CTB_SIZE / ctb_size - 1)) * ctb_size / 2; 405 * which will reduce to the following assuming ctb size is one of 16, 32 and 64 406 * and deblocking is done on 8x8 grid 407 */ 408 if(6 != log2_ctb_size) 409 shift = (row & 1) << (log2_ctb_size - 1); 410 411 /* BS for the row - Last column is excluded and the left column is included*/ 412 u4_bs = (pu4_horz_bs[0] >> shift) << 2; 413 414 if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x) 415 { 416 /** Picking the last BS of the previous CTB corresponding to the same row 417 * UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (ctb_size / 8) * (ctb_size / 4) / 8 * 2); 418 */ 419 UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (1 << (2 * log2_ctb_size - 7))); 420 UWORD32 u4_left_bs = (*pu4_horz_bs_left) >> (shift + (1 << (log2_ctb_size - 1)) - 2); 421 u4_bs |= u4_left_bs & 3; 422 } 423 424 for(col = 0; col < ctb_size / 4;) 425 { 426 WORD8 i1_beta_offset_div2 = ps_slice_hdr->i1_beta_offset_div2; 427 WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2; 428 429 bs_tz = CTZ(u4_bs) >> 1; 430 if(0 != bs_tz) 431 { 432 u4_bs = u4_bs >> (bs_tz << 1); 433 434 if((col + bs_tz) >= (ctb_size / 4)) 435 pu1_src += 4 * (ctb_size / 4 - col); 436 else 437 pu1_src += 4 * bs_tz; 438 439 col += bs_tz; 440 continue; 441 } 442 443 if(0 == col) 444 { 445 i1_beta_offset_div2 = ps_slice_hdr_left->i1_beta_offset_div2; 446 i1_tc_offset_div2 = ps_slice_hdr_left->i1_tc_offset_div2; 447 448 if(0 == row) 449 { 450 qp_p = u4_qp_const_in_ctb[0] ? 451 pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] : 452 pu1_qp[-qp_strd - 1]; 453 } 454 else 455 { 456 qp_p = u4_qp_const_in_ctb[2] ? 457 pu1_qp[-ctb_size / 8] : 458 pu1_qp[(row - 1) * qp_strd - 1]; 459 } 460 461 qp_q = u4_qp_const_in_ctb[2] ? 462 pu1_qp[-ctb_size / 8] : 463 pu1_qp[row * qp_strd - 1]; 464 } 465 else 466 { 467 if(0 == row) 468 { 469 qp_p = u4_qp_const_in_ctb[1] ? 470 pu1_qp[-ctb_size / 8 * qp_strd] : 471 pu1_qp[((col - 1) >> 1) - qp_strd]; 472 } 473 else 474 { 475 qp_p = u4_qp_const_in_ctb[3] ? 476 pu1_qp[0] : 477 pu1_qp[((col - 1) >> 1) + (row - 1) * qp_strd]; 478 } 479 480 qp_q = u4_qp_const_in_ctb[3] ? 481 pu1_qp[0] : 482 pu1_qp[((col - 1) >> 1) + row * qp_strd]; 483 } 484 485 filter_p = (pu2_ctb_no_loop_filter_flag[row] >> ((col + 1) >> 1)) & 1; 486 filter_q = (pu2_ctb_no_loop_filter_flag[row + 1] >> ((col + 1) >> 1)) & 1; 487 /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */ 488 filter_p = !filter_p; 489 filter_q = !filter_q; 490 491 if(filter_p || filter_q) 492 { 493#if DEBUG_DEBLK_LEAF_LEVEL 494 { 495 DUMP_DEBLK_LUMA_HORZ(pu1_src, src_strd, 496 u4_bs & 3, qp_p, qp_q, 497 ps_slice_hdr->i1_beta_offset_div2, 498 ps_slice_hdr->i1_tc_offset_div2, 499 filter_p, filter_q); 500 } 501#endif 502 ps_codec->s_func_selector.ihevc_deblk_luma_horz_fptr(pu1_src, src_strd, 503 u4_bs & 3, qp_p, qp_q, 504 i1_beta_offset_div2, 505 i1_tc_offset_div2, filter_p, filter_q); 506 } 507 508 pu1_src += 4; 509 u4_bs = u4_bs >> 2; 510 col++; 511 } 512 513 if((64 == ctb_size) || 514 ((32 == ctb_size) && (row & 1))) 515 { 516 pu4_horz_bs++; 517 } 518 pu1_src -= ctb_size; 519 pu1_src += (src_strd << 3); 520 } 521 pu4_horz_bs = pu4_ctb_horz_bs; 522 } 523 524 525 /* Chroma Veritcal Edge */ 526 527 if(0 == i4_is_last_ctb_x) 528 { 529 530 /* Top CTB's slice header */ 531 slice_header_t *ps_slice_hdr_top; 532#ifdef GPU_BUILD 533//TODO GPU : Later define it for ARM only version as well 534 { 535 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 536 if(i4_is_last_ctb_y) 537 cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb; 538 ps_slice_hdr_top = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb]; 539 } 540#else 541 { 542 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 543 if(i4_is_last_ctb_y) 544 cur_ctb_indx += ps_sps->i2_pic_wd_in_ctb; 545 ps_slice_hdr_top = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - ps_sps->i2_pic_wd_in_ctb]; 546 } 547#endif 548 549 pu1_src = ps_deblk->pu1_cur_pic_chroma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size); 550 pu1_src += i4_is_last_ctb_y ? (ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size : 0; 551 552 /** Deblocking is done on a shifted CTB - 553 * Vertical edge processing is done by shifting the CTB up by four pixels */ 554 pu1_src -= 4 * src_strd; 555 556 for(col = 0; col < ctb_size / 16; col++) 557 { 558 559 /* BS for the column - Last row is excluded and the top row is included*/ 560 u4_bs = pu4_vert_bs[0] << 2; 561 562 if(ps_deblk->i4_ctb_y || i4_is_last_ctb_y) 563 { 564 /* Picking the last BS of the previous CTB corresponding to the same column */ 565 UWORD32 *pu4_vert_bs_top = (UWORD32 *)((UWORD8 *)pu4_vert_bs - vert_bs_strd); 566 UWORD32 u4_top_bs = (*pu4_vert_bs_top) >> ((1 << (log2_ctb_size - 1)) - 2); 567 u4_bs |= u4_top_bs & 3; 568 } 569 570 /* Every alternate boundary strength value is used for chroma */ 571 u4_bs &= 0x22222222; 572 573 for(row = 0; row < ctb_size / 8;) 574 { 575 WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2; 576 577 bs_tz = CTZ(u4_bs) >> 2; 578 if(0 != bs_tz) 579 { 580 if((row + bs_tz) >= (ctb_size / 8)) 581 pu1_src += 4 * (ctb_size / 8 - row) * src_strd; 582 else 583 pu1_src += 4 * bs_tz * src_strd; 584 row += bs_tz; 585 u4_bs = u4_bs >> (bs_tz << 2); 586 continue; 587 } 588 589 if(0 == row) 590 { 591 i1_tc_offset_div2 = ps_slice_hdr_top->i1_tc_offset_div2; 592 593 if(0 == col) 594 { 595 qp_p = u4_qp_const_in_ctb[0] ? 596 pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] : 597 pu1_qp[-qp_strd - 1]; 598 } 599 else 600 { 601 qp_p = u4_qp_const_in_ctb[1] ? 602 pu1_qp[-ctb_size / 8 * qp_strd] : 603 pu1_qp[2 * col - 1 - qp_strd]; 604 } 605 606 qp_q = u4_qp_const_in_ctb[1] ? 607 pu1_qp[-ctb_size / 8 * qp_strd] : 608 pu1_qp[2 * col - qp_strd]; 609 } 610 else 611 { 612 if(0 == col) 613 { 614 qp_p = u4_qp_const_in_ctb[2] ? 615 pu1_qp[-ctb_size / 8] : 616 pu1_qp[(row - 1) * qp_strd - 1]; 617 } 618 else 619 { 620 qp_p = u4_qp_const_in_ctb[3] ? 621 pu1_qp[0] : 622 pu1_qp[(row - 1) * qp_strd + 2 * col - 1]; 623 } 624 625 qp_q = u4_qp_const_in_ctb[3] ? 626 pu1_qp[0] : 627 pu1_qp[(row - 1) * qp_strd + 2 * col]; 628 } 629 630 filter_p = (pu2_ctb_no_loop_filter_flag[row] >> (col << 1)) & 1; 631 filter_q = (pu2_ctb_no_loop_filter_flag[row] >> (col << 1)) & 2; 632 /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */ 633 filter_p = !filter_p; 634 filter_q = !filter_q; 635 636 if(filter_p || filter_q) 637 { 638 ASSERT(1 == ((u4_bs & 3) >> 1)); 639#if DEBUG_DEBLK_LEAF_LEVEL 640 { 641 DUMP_DEBLK_CHROMA_VERT(pu1_src, src_strd, 642 u4_bs & 3, qp_p, qp_q, 643 ps_pps->i1_pic_cb_qp_offset, 644 ps_pps->i1_pic_cr_qp_offset, 645 ps_slice_hdr->i1_tc_offset_div2, 646 filter_p, filter_q); 647 } 648#endif 649 if(chroma_yuv420sp_vu) 650 { 651 ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr(pu1_src, 652 src_strd, 653 qp_q, 654 qp_p, 655 ps_pps->i1_pic_cr_qp_offset, 656 ps_pps->i1_pic_cb_qp_offset, 657 i1_tc_offset_div2, 658 filter_q, 659 filter_p); 660 } 661 else 662 { 663 ps_codec->s_func_selector.ihevc_deblk_chroma_vert_fptr(pu1_src, 664 src_strd, 665 qp_p, 666 qp_q, 667 ps_pps->i1_pic_cb_qp_offset, 668 ps_pps->i1_pic_cr_qp_offset, 669 i1_tc_offset_div2, 670 filter_p, 671 filter_q); 672 } 673 } 674 675 pu1_src += 4 * src_strd; 676 u4_bs = u4_bs >> 4; 677 row++; 678 } 679 680 pu4_vert_bs += (64 == ctb_size) ? 2 : 1; 681 pu1_src -= ((src_strd / 2) << log2_ctb_size); 682 pu1_src += 16; 683 } 684 } 685 686 /* Chroma Horizontal Edge */ 687 688 if(0 == i4_is_last_ctb_y) 689 { 690 691 /* Left CTB's slice header */ 692 slice_header_t *ps_slice_hdr_left; 693#ifdef GPU_BUILD 694//TODO GPU : Later define it for ARM only version as well 695 { 696 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 697 if(i4_is_last_ctb_x) 698 cur_ctb_indx += 1; 699 ps_slice_hdr_left = ps_deblk->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1]; 700 } 701#else 702 { 703 WORD32 cur_ctb_indx = ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb; 704 if(i4_is_last_ctb_x) 705 cur_ctb_indx += 1; 706 ps_slice_hdr_left = ps_codec->ps_slice_hdr_base + ps_deblk->pu1_slice_idx[cur_ctb_indx - 1]; 707 } 708#endif 709 710 pu1_src = ps_deblk->pu1_cur_pic_chroma + ((ps_deblk->i4_ctb_x + ps_deblk->i4_ctb_y * ps_deblk->ps_codec->i4_strd / 2) << log2_ctb_size); 711 pu1_src += i4_is_last_ctb_x ? ctb_size : 0; 712 713 /** Deblocking is done on a shifted CTB - 714 * Vertical edge processing is done by shifting the CTB up by four pixels (8 here beacuse UV are interleaved) */ 715 pu1_src -= 8; 716 for(row = 0; row < ctb_size / 16; row++) 717 { 718 /* BS for the row - Last column is excluded and the left column is included*/ 719 u4_bs = pu4_horz_bs[0] << 2; 720 721 if(ps_deblk->i4_ctb_x || i4_is_last_ctb_x) 722 { 723 /** Picking the last BS of the previous CTB corresponding to the same row 724 * UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (ctb_size / 8) * (ctb_size / 4) / 8 * 2); 725 */ 726 UWORD32 *pu4_horz_bs_left = (UWORD32 *)((UWORD8 *)pu4_horz_bs - (1 << (2 * log2_ctb_size - 7))); 727 UWORD32 u4_left_bs = (*pu4_horz_bs_left) >> ((1 << (log2_ctb_size - 1)) - 2); 728 u4_bs |= u4_left_bs & 3; 729 } 730 731 /* Every alternate boundary strength value is used for chroma */ 732 u4_bs &= 0x22222222; 733 734 for(col = 0; col < ctb_size / 8;) 735 { 736 WORD8 i1_tc_offset_div2 = ps_slice_hdr->i1_tc_offset_div2; 737 738 bs_tz = CTZ(u4_bs) >> 2; 739 if(0 != bs_tz) 740 { 741 u4_bs = u4_bs >> (bs_tz << 2); 742 743 if((col + bs_tz) >= (ctb_size / 8)) 744 pu1_src += 8 * (ctb_size / 8 - col); 745 else 746 pu1_src += 8 * bs_tz; 747 748 col += bs_tz; 749 continue; 750 } 751 752 if(0 == col) 753 { 754 i1_tc_offset_div2 = ps_slice_hdr_left->i1_tc_offset_div2; 755 756 if(0 == row) 757 { 758 qp_p = u4_qp_const_in_ctb[0] ? 759 pu1_qp[-ctb_size / 8 * qp_strd - ctb_size / 8] : 760 pu1_qp[-qp_strd - 1]; 761 } 762 else 763 { 764 qp_p = u4_qp_const_in_ctb[2] ? 765 pu1_qp[-ctb_size / 8] : 766 pu1_qp[(2 * row - 1) * qp_strd - 1]; 767 } 768 769 qp_q = u4_qp_const_in_ctb[2] ? 770 pu1_qp[-ctb_size / 8] : 771 pu1_qp[(2 * row) * qp_strd - 1]; 772 } 773 else 774 { 775 if(0 == row) 776 { 777 qp_p = u4_qp_const_in_ctb[1] ? 778 pu1_qp[-ctb_size / 8 * qp_strd] : 779 pu1_qp[col - 1 - qp_strd]; 780 } 781 else 782 { 783 qp_p = u4_qp_const_in_ctb[3] ? 784 pu1_qp[0] : 785 pu1_qp[(col - 1) + (2 * row - 1) * qp_strd]; 786 } 787 788 qp_q = u4_qp_const_in_ctb[3] ? 789 pu1_qp[0] : 790 pu1_qp[(col - 1) + 2 * row * qp_strd]; 791 } 792 793 filter_p = (pu2_ctb_no_loop_filter_flag[row << 1] >> col) & 1; 794 filter_q = (pu2_ctb_no_loop_filter_flag[(row << 1) + 1] >> col) & 1; 795 /* filter_p and filter_q are inverted as they are calculated using no_loop_filter_flags */ 796 filter_p = !filter_p; 797 filter_q = !filter_q; 798 799 if(filter_p || filter_q) 800 { 801 ASSERT(1 == ((u4_bs & 3) >> 1)); 802#if DEBUG_DEBLK_LEAF_LEVEL 803 { 804 DUMP_DEBLK_CHROMA_HORZ(pu1_src, src_strd, 805 u4_bs & 3, qp_p, qp_q, 806 ps_pps->i1_pic_cb_qp_offset, 807 ps_pps->i1_pic_cr_qp_offset, 808 ps_slice_hdr->i1_tc_offset_div2, 809 filter_p, filter_q); 810 } 811#endif 812 if(chroma_yuv420sp_vu) 813 { 814 ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr(pu1_src, 815 src_strd, 816 qp_q, 817 qp_p, 818 ps_pps->i1_pic_cr_qp_offset, 819 ps_pps->i1_pic_cb_qp_offset, 820 i1_tc_offset_div2, 821 filter_q, 822 filter_p); 823 } 824 else 825 { 826 ps_codec->s_func_selector.ihevc_deblk_chroma_horz_fptr(pu1_src, 827 src_strd, 828 qp_p, 829 qp_q, 830 ps_pps->i1_pic_cb_qp_offset, 831 ps_pps->i1_pic_cr_qp_offset, 832 i1_tc_offset_div2, 833 filter_p, 834 filter_q); 835 } 836 } 837 838 pu1_src += 8; 839 u4_bs = u4_bs >> 4; 840 col++; 841 } 842 843 pu4_horz_bs += (64 == ctb_size) ? 2 : 1; 844 pu1_src -= ctb_size; 845 pu1_src += 8 * src_strd; 846 847 } 848 } 849} 850