ihevcd_process_slice.c revision 707042fda96ebede81408b854385173483798bcd
1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevcd_process_slice.c
22 *
23 * @brief
24 *  Contains functions for processing slice data
25 *
26 * @author
27 *  Harish
28 *
29 * @par List of Functions:
30 *
31 * @remarks
32 *  None
33 *
34 *******************************************************************************
35 */
36/*****************************************************************************/
37/* File Includes                                                             */
38/*****************************************************************************/
39#include <stdio.h>
40#include <stddef.h>
41#include <stdlib.h>
42#include <string.h>
43#include <assert.h>
44
45#include "ihevc_typedefs.h"
46#include "iv.h"
47#include "ivd.h"
48#include "ihevcd_cxa.h"
49#include "ithread.h"
50
51#include "ihevc_defs.h"
52#include "ihevc_debug.h"
53#include "ihevc_defs.h"
54#include "ihevc_structs.h"
55#include "ihevc_macros.h"
56#include "ihevc_platform_macros.h"
57#include "ihevc_cabac_tables.h"
58#include "ihevc_padding.h"
59#include "ihevc_iquant_itrans_recon.h"
60#include "ihevc_chroma_iquant_itrans_recon.h"
61#include "ihevc_recon.h"
62#include "ihevc_chroma_recon.h"
63#include "ihevc_iquant_recon.h"
64#include "ihevc_chroma_iquant_recon.h"
65#include "ihevc_intra_pred.h"
66
67#include "ihevc_error.h"
68#include "ihevc_common_tables.h"
69#include "ihevc_quant_tables.h"
70#include "ihevcd_common_tables.h"
71
72#include "ihevcd_profile.h"
73#include "ihevcd_trace.h"
74#include "ihevcd_defs.h"
75#include "ihevcd_function_selector.h"
76#include "ihevcd_structs.h"
77#include "ihevcd_error.h"
78#include "ihevcd_nal.h"
79#include "ihevcd_bitstream.h"
80#include "ihevcd_job_queue.h"
81#include "ihevcd_utils.h"
82#include "ihevcd_debug.h"
83#include "ihevcd_get_mv.h"
84#include "ihevcd_inter_pred.h"
85#include "ihevcd_iquant_itrans_recon_ctb.h"
86#include "ihevcd_boundary_strength.h"
87#include "ihevcd_deblk.h"
88#include "ihevcd_fmt_conv.h"
89#ifdef GPU_BUILD
90#include "ihevcd_opencl_mc_interface.h"
91#endif
92#include "ihevcd_sao.h"
93#include "ihevcd_profile.h"
94
95IHEVCD_ERROR_T ihevcd_fmt_conv(codec_t *ps_codec,
96                               process_ctxt_t *ps_proc,
97                               UWORD8 *pu1_y_dst,
98                               UWORD8 *pu1_u_dst,
99                               UWORD8 *pu1_v_dst,
100                               WORD32 cur_row,
101                               WORD32 num_rows);
102
/*
 * Processing stage selector.
 * The numeric value of an enumerator is used as a bit position: the proc-map
 * helpers in this file build their mask as (1 << proc_type), so every stage
 * owns one bit in each per-CTB proc-map byte.
 */
typedef enum
{
    PROC_ALL        = 0,
    PROC_INTER_PRED = 1,
    PROC_RECON      = 2,
    PROC_DEBLK      = 3,
    PROC_SAO        = 4
} proc_type_t;
111
112void ihevcd_proc_map_check(process_ctxt_t *ps_proc, proc_type_t proc_type, WORD32 nctb)
113{
114    tile_t *ps_tile = ps_proc->ps_tile;
115    sps_t *ps_sps = ps_proc->ps_sps;
116    pps_t *ps_pps = ps_proc->ps_pps;
117    codec_t *ps_codec = ps_proc->ps_codec;
118    WORD32 idx;
119    WORD32 nop_cnt;
120    WORD32 bit_pos = proc_type;
121    WORD32 bit_mask = (1 << bit_pos);
122
123    if(ps_proc->i4_check_proc_status)
124    {
125        nop_cnt = PROC_NOP_CNT;
126        while(1)
127        {
128            volatile UWORD8 *pu1_buf;
129            volatile WORD32 status;
130            status = 1;
131            /* Check if all dependencies for the next nCTBs are met */
132            {
133                WORD32 x_pos;
134
135                {
136                    /* Check if the top right of next nCTBs are processed */
137                    if(ps_proc->i4_ctb_y > 0)
138                    {
139                        x_pos = (ps_proc->i4_ctb_tile_x + nctb);
140                        idx = MIN(x_pos, (ps_tile->u2_wd - 1));
141
142                        /* Check if top-right CTB for the last CTB in nCTB is within the tile */
143                        {
144                            idx += ps_tile->u1_pos_x;
145                            idx += ((ps_proc->i4_ctb_y - 1)
146                                            * ps_sps->i2_pic_wd_in_ctb);
147#ifdef GPU_BUILD
148                            //TODO GPU : Later define it for ARM only version as well
149                            pu1_buf = (ps_proc->pu1_proc_map + idx);
150#else
151                            pu1_buf = (ps_codec->pu1_proc_map + idx);
152#endif
153                            status = *pu1_buf & bit_mask;
154                        }
155                    }
156                }
157
158                /* If tiles are enabled, then test left and top-left as well */
159                ps_pps = ps_proc->ps_pps;
160                if(ps_pps->i1_tiles_enabled_flag)
161                {
162                    /*Check if left ctb is processed*/
163                    if((ps_proc->i4_ctb_x > 0) && ((0 != status)))
164                    {
165                        x_pos   = ps_tile->u1_pos_x + ps_proc->i4_ctb_tile_x - 1;
166                        idx     = x_pos + (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
167#ifdef GPU_BUILD
168                        //TODO GPU : Later define it for ARM only version as well
169                        pu1_buf = (ps_proc->pu1_proc_map + idx);
170#else
171                        pu1_buf = (ps_codec->pu1_proc_map + idx);
172#endif
173                        status  = *pu1_buf & bit_mask;
174                    }
175
176                    /*Check if top left ctb is processed*/
177                    if((ps_proc->i4_ctb_x > 0) && (0 != status) && (ps_proc->i4_ctb_y > 0))
178                    {
179                        x_pos   = ps_tile->u1_pos_x + ps_proc->i4_ctb_tile_x - 1;
180                        idx     = x_pos + ((ps_proc->i4_ctb_y - 1) * ps_sps->i2_pic_wd_in_ctb);
181#ifdef GPU_BUILD
182                        //TODO GPU : Later define it for ARM only version as well
183                        pu1_buf = (ps_proc->pu1_proc_map + idx);
184#else
185                        pu1_buf = (ps_codec->pu1_proc_map + idx);
186#endif
187                        status  = *pu1_buf & bit_mask;
188                    }
189                }
190            }
191
192            if(status)
193                break;
194
195            /* if dependencies are not met, then wait for few cycles.
196             * Even after few iterations, if the dependencies are not met then yield
197             */
198            if(nop_cnt > 0)
199            {
200                NOP(128);
201                nop_cnt -= 128;
202            }
203            else
204            {
205                nop_cnt = PROC_NOP_CNT;
206                ithread_yield();
207                //NOP(128 * 16);
208            }
209        }
210    }
211}
212
213void ihevcd_proc_map_update(process_ctxt_t *ps_proc, proc_type_t proc_type, WORD32 nctb)
214{
215    codec_t *ps_codec = ps_proc->ps_codec;
216    WORD32 i, idx;
217    WORD32 bit_pos = proc_type;
218    WORD32 bit_mask = (1 << bit_pos);
219
220    /* Update the current CTBs processing status */
221    if(ps_proc->i4_check_proc_status)
222    {
223        for(i = 0; i < nctb; i++)
224        {
225            sps_t *ps_sps = ps_proc->ps_sps;
226            UWORD8 *pu1_buf;
227            idx = (ps_proc->i4_ctb_x + i);
228            idx += ((ps_proc->i4_ctb_y) * ps_sps->i2_pic_wd_in_ctb);
229#ifdef GPU_BUILD
230            //TODO GPU : Later define it for ARM only version as well
231            pu1_buf = (ps_proc->pu1_proc_map + idx);
232#else
233            pu1_buf = (ps_codec->pu1_proc_map + idx);
234#endif
235            *pu1_buf = *pu1_buf | bit_mask;
236        }
237    }
238}
239
240
241void ihevcd_slice_hdr_update(process_ctxt_t *ps_proc)
242{
243
244    /* Slice x and y are initialized in proc_init. But initialize slice x and y count here
245     *  if a new slice begins at the middle of a row since proc_init is invoked only at the beginning of each row */
246    if(!((ps_proc->i4_ctb_x == 0) && (ps_proc->i4_ctb_y == 0)))
247    {
248#ifdef GPU_BUILD
249        //TODO GPU : Later define it for ARM only version as well
250        slice_header_t *ps_slice_hdr_next = ps_proc->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
251#else
252        slice_header_t *ps_slice_hdr_next = ps_proc->ps_codec->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx + 1) & (MAX_SLICE_HDR_CNT - 1));
253#endif
254
255        if((ps_slice_hdr_next->i2_ctb_x == ps_proc->i4_ctb_x)
256                        && (ps_slice_hdr_next->i2_ctb_y == ps_proc->i4_ctb_y))
257        {
258            if(0 == ps_slice_hdr_next->i1_dependent_slice_flag)
259            {
260                ps_proc->i4_ctb_slice_x = 0;
261                ps_proc->i4_ctb_slice_y = 0;
262            }
263
264            ps_proc->i4_cur_slice_idx++;
265            ps_proc->ps_slice_hdr = ps_slice_hdr_next;
266        }
267
268    }
269}
270
271void ihevcd_ctb_pos_update(process_ctxt_t *ps_proc, WORD32 nctb)
272{
273    WORD32 tile_start_ctb_idx, slice_start_ctb_idx;
274    slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
275    tile_t *ps_tile = ps_proc->ps_tile;
276    sps_t *ps_sps = ps_proc->ps_sps;
277
278    /* Update x and y positions */
279    ps_proc->i4_ctb_tile_x += nctb;
280    ps_proc->i4_ctb_x += nctb;
281
282    ps_proc->i4_ctb_slice_x += nctb;
283    /*If tile are enabled, then handle the tile & slice counters differently*/
284    if(ps_proc->ps_pps->i1_tiles_enabled_flag)
285    {
286        /* Update slice counters*/
287        slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
288        tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
289        /*
290         * There can be 2 cases where slice counters must be handled differently.
291         * 1 - Multiple tiles span across a single/one of the many slice.
292         * 2 - Multiple slices span across a single/one of the many tiles.
293         */
294
295        /*Case 1 */
296        if(slice_start_ctb_idx < tile_start_ctb_idx)
297        {
298            /*End of tile row*/
299            if(ps_proc->i4_ctb_x > ps_slice_hdr->i2_ctb_x)
300            {
301                if(ps_proc->i4_ctb_slice_x >= (ps_tile->u2_wd + ps_tile->u1_pos_x))
302                {
303                    ps_proc->i4_ctb_slice_y++;
304                    ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
305                                    - ps_tile->u2_wd;
306                }
307            }
308            else
309            {
310                WORD32 temp_stride = (ps_sps->i2_pic_wd_in_ctb - ps_slice_hdr->i2_ctb_x);
311                if(ps_proc->i4_ctb_slice_x >= (temp_stride + ps_tile->u2_wd + ps_tile->u1_pos_x))
312                {
313                    ps_proc->i4_ctb_slice_y++;
314                    ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
315                                    - ps_tile->u2_wd;
316                }
317            }
318        }
319        /*Case 2*/
320        else if(ps_proc->i4_ctb_slice_x >= (ps_tile->u2_wd))
321        {
322            /*End of tile row*/
323            ps_proc->i4_ctb_slice_y++;
324            ps_proc->i4_ctb_slice_x = 0;
325        }
326    }
327    else
328    {
329        if(ps_proc->i4_ctb_slice_x >= ps_tile->u2_wd)
330        {
331            ps_proc->i4_ctb_slice_y++;
332            ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_slice_x
333                            - ps_tile->u2_wd;
334        }
335    }
336}
337
338void ihevcd_ctb_avail_update(process_ctxt_t *ps_proc)
339{
340    slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
341    sps_t *ps_sps = ps_proc->ps_sps;
342    tile_t *ps_tile_prev;
343    tile_t *ps_tile = ps_proc->ps_tile;
344    WORD32 cur_pu_idx;
345    WORD32 tile_start_ctb_idx, slice_start_ctb_idx;
346    WORD16 i2_wd_in_ctb;
347    WORD32 continuous_tiles = 0;
348    WORD32 cur_ctb_idx;
349    WORD32 check_tile_wd;
350
351    if((0 != ps_tile->u1_pos_x) && (0 != ps_tile->u1_pos_y))
352    {
353        ps_tile_prev = ps_tile - 1;
354    }
355    else
356    {
357        ps_tile_prev = ps_tile;
358    }
359
360
361    check_tile_wd = ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd;
362    if(!(((check_tile_wd >= ps_sps->i2_pic_wd_in_ctb) && (check_tile_wd % ps_sps->i2_pic_wd_in_ctb == ps_tile->u1_pos_x))
363                                    || ((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x))))
364    {
365        continuous_tiles = 1;
366    }
367
368    slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
369    tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
370
371    if((slice_start_ctb_idx < tile_start_ctb_idx) && (continuous_tiles))
372    {
373        //Slices span across multiple tiles.
374        i2_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
375    }
376    else
377    {
378        i2_wd_in_ctb = ps_tile->u2_wd;
379    }
380    cur_ctb_idx = ps_proc->i4_ctb_x
381                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
382
383    /* Ctb level availability */
384    /* Bottom left will not be available at a CTB level, no need to pass this */
385    ps_proc->u1_top_ctb_avail = 1;
386    ps_proc->u1_left_ctb_avail = 1;
387    ps_proc->u1_top_lt_ctb_avail = 1;
388    ps_proc->u1_top_rt_ctb_avail = 1;
389    /* slice and tile boundaries */
390
391    if((0 == ps_proc->i4_ctb_y) || (0 == ps_proc->i4_ctb_tile_y))
392    {
393        ps_proc->u1_top_ctb_avail = 0;
394        ps_proc->u1_top_lt_ctb_avail = 0;
395        ps_proc->u1_top_rt_ctb_avail = 0;
396    }
397
398    if((0 == ps_proc->i4_ctb_x) || (0 == ps_proc->i4_ctb_tile_x))
399    {
400        ps_proc->u1_left_ctb_avail = 0;
401        ps_proc->u1_top_lt_ctb_avail = 0;
402        if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
403        {
404            ps_proc->u1_top_ctb_avail = 0;
405            if((i2_wd_in_ctb - 1) != ps_proc->i4_ctb_slice_x)
406            {
407                ps_proc->u1_top_rt_ctb_avail = 0;
408            }
409        }
410    }
411    /*For slices not beginning at start of a ctb row*/
412    else if(ps_proc->i4_ctb_x > 0)
413    {
414        if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
415        {
416            ps_proc->u1_top_ctb_avail = 0;
417            ps_proc->u1_top_lt_ctb_avail = 0;
418            if(0 == ps_proc->i4_ctb_slice_x)
419            {
420                ps_proc->u1_left_ctb_avail = 0;
421            }
422            if((i2_wd_in_ctb - 1) != ps_proc->i4_ctb_slice_x)
423            {
424                ps_proc->u1_top_rt_ctb_avail = 0;
425            }
426        }
427        else if((1 == ps_proc->i4_ctb_slice_y) && (0 == ps_proc->i4_ctb_slice_x))
428        {
429            ps_proc->u1_top_lt_ctb_avail = 0;
430        }
431    }
432
433    if((ps_proc->i4_ctb_x == (ps_sps->i2_pic_wd_in_ctb - 1)) || ((ps_tile->u2_wd - 1) == ps_proc->i4_ctb_tile_x))
434    {
435        ps_proc->u1_top_rt_ctb_avail = 0;
436    }
437
438
439#if 0
440    if((((0 == ps_proc->i4_ctb_slice_x)
441         && (0 == ps_proc->i4_ctb_slice_y))
442        || (0 == ps_proc->i4_ctb_tile_x)))
443    {
444        ps_proc->u1_left_ctb_avail = 0;
445        ps_proc->u1_top_lt_ctb_avail = 0;
446    }
447    if((0 == ps_proc->i4_ctb_slice_y) || (0 == ps_proc->i4_ctb_tile_y))
448    {
449        ps_proc->u1_top_ctb_avail = 0;
450        ps_proc->u1_top_lt_ctb_avail = 0;
451        ps_proc->u1_top_rt_ctb_avail = 0;
452    }
453    /* Image boundaries */
454    if(ps_proc->i4_ctb_x == 0)
455    {
456        ps_proc->u1_left_ctb_avail = 0;
457        ps_proc->u1_top_lt_ctb_avail = 0;
458    }
459    if(ps_proc->i4_ctb_x == (ps_sps->i2_pic_wd_in_ctb - 1))
460    {
461        ps_proc->u1_top_rt_ctb_avail = 0;
462    }
463    if(ps_proc->i4_ctb_y == 0)
464    {
465        ps_proc->u1_top_ctb_avail = 0;
466        ps_proc->u1_top_lt_ctb_avail = 0;
467        ps_proc->u1_top_rt_ctb_avail = 0;
468    }
469#endif
470    {
471        WORD32 next_ctb_idx;
472        next_ctb_idx = cur_ctb_idx + 1;
473
474        if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
475        {
476            if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
477            {
478                //Last tile
479                if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
480                {
481                    next_ctb_idx = cur_ctb_idx + 1;
482                }
483                else //Not last tile, but new tile
484                {
485                    tile_t *ps_tile_next = ps_proc->ps_tile + 1;
486                    next_ctb_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
487                }
488            }
489            else //End of each tile row
490            {
491                next_ctb_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
492            }
493        }
494        ps_proc->i4_next_pu_ctb_cnt = next_ctb_idx;
495        ps_proc->i4_ctb_pu_cnt =
496                        ps_proc->pu4_pic_pu_idx[next_ctb_idx]
497                        - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
498        cur_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
499        ps_proc->i4_ctb_start_pu_idx = cur_pu_idx;
500        ps_proc->ps_pu = &ps_proc->ps_pic_pu[cur_pu_idx];
501    }
502}
503
504void ihevcd_update_ctb_tu_cnt(process_ctxt_t *ps_proc)
505{
506    sps_t *ps_sps = ps_proc->ps_sps;
507    codec_t *ps_codec = ps_proc->ps_codec;
508    WORD32 cur_ctb_idx;
509
510    cur_ctb_idx = ps_proc->i4_ctb_x
511                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
512
513    {
514        tile_t *ps_tile;
515        WORD32 next_ctb_tu_idx;
516        ps_tile = ps_proc->ps_tile;
517
518
519        if(1 == ps_codec->i4_num_cores)
520        {
521            next_ctb_tu_idx = cur_ctb_idx % RESET_TU_BUF_NCTB + 1;
522            if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
523            {
524                if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
525                {
526                    //Last tile
527                    if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
528                    {
529                        next_ctb_tu_idx = (cur_ctb_idx % RESET_TU_BUF_NCTB) + 1;
530                    }
531                    else //Not last tile, but new tile
532                    {
533                        tile_t *ps_tile_next = ps_proc->ps_tile + 1;
534                        next_ctb_tu_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
535                    }
536                }
537                else //End of each tile row
538                {
539                    next_ctb_tu_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
540                }
541            }
542            ps_proc->i4_next_tu_ctb_cnt = next_ctb_tu_idx;
543            ps_proc->i4_ctb_tu_cnt = ps_proc->pu4_pic_tu_idx[next_ctb_tu_idx] - ps_proc->pu4_pic_tu_idx[cur_ctb_idx % RESET_TU_BUF_NCTB];
544        }
545        else
546        {
547            next_ctb_tu_idx = cur_ctb_idx + 1;
548            if(ps_tile->u2_wd == (ps_proc->i4_ctb_tile_x + 1))
549            {
550                if((ps_proc->i4_ctb_tile_y + 1) == ps_tile->u2_ht)
551                {
552                    //Last tile
553                    if(((ps_proc->i4_ctb_tile_y + 1 + ps_tile->u1_pos_y) == ps_sps->i2_pic_ht_in_ctb) && ((ps_proc->i4_ctb_tile_x + 1 + ps_tile->u1_pos_x) == ps_sps->i2_pic_wd_in_ctb))
554                    {
555                        next_ctb_tu_idx = (cur_ctb_idx % RESET_TU_BUF_NCTB) + 1;
556                    }
557                    else //Not last tile, but new tile
558                    {
559                        tile_t *ps_tile_next = ps_proc->ps_tile + 1;
560                        next_ctb_tu_idx = ps_tile_next->u1_pos_x + (ps_tile_next->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
561                    }
562                }
563                else //End of each tile row
564                {
565                    next_ctb_tu_idx = ((ps_tile->u1_pos_y + ps_proc->i4_ctb_tile_y + 1) * ps_sps->i2_pic_wd_in_ctb) + ps_tile->u1_pos_x;
566                }
567            }
568            ps_proc->i4_next_tu_ctb_cnt = next_ctb_tu_idx;
569            ps_proc->i4_ctb_tu_cnt = ps_proc->pu4_pic_tu_idx[next_ctb_tu_idx] -
570                            ps_proc->pu4_pic_tu_idx[cur_ctb_idx];
571        }
572    }
573}
574
575IHEVCD_ERROR_T ihevcd_process(process_ctxt_t *ps_proc)
576{
577    IHEVCD_ERROR_T ret = (IHEVCD_ERROR_T)IHEVCD_SUCCESS;
578    codec_t *ps_codec;
579    sps_t *ps_sps = ps_proc->ps_sps;
580
581    WORD32 nctb;
582    WORD32 i;
583    WORD32 idx;
584    WORD32 nop_cnt;
585    WORD32 num_minpu_in_ctb;
586    WORD32 cur_slice_idx, cur_ctb_tile_x, cur_ctb_slice_x, cur_ctb_tile_y, cur_ctb_slice_y;
587    WORD32 nxt_ctb_slice_y, nxt_ctb_slice_x;
588    tu_t *ps_tu_cur, *ps_tu_nxt;
589    UWORD8 *pu1_pu_map_cur, *pu1_pu_map_nxt;
590    WORD32 num_ctb, num_ctb_tmp;
591    proc_type_t proc_type;
592
593
594    WORD32 ctb_size = 1 << ps_sps->i1_log2_ctb_size;
595
596    PROFILE_DISABLE_PROCESS_CTB();
597
598    ps_codec = ps_proc->ps_codec;
599    num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
600
601    nctb = MIN(ps_codec->i4_proc_nctb, ps_proc->i4_ctb_cnt);
602    nctb = MIN(nctb, (ps_proc->ps_tile->u2_wd - ps_proc->i4_ctb_tile_x));
603
604    if(ps_proc->i4_cur_slice_idx > (MAX_SLICE_HDR_CNT - 2 * ps_sps->i2_pic_wd_in_ctb))
605    {
606        num_ctb = 1;
607    }
608    else
609    {
610        num_ctb = ps_proc->i4_nctb;
611    }
612    nxt_ctb_slice_y = ps_proc->i4_ctb_slice_y;
613    nxt_ctb_slice_x = ps_proc->i4_ctb_slice_x;
614    pu1_pu_map_nxt = ps_proc->pu1_pu_map;
615    ps_tu_nxt = ps_proc->ps_tu;
616
617    while(ps_proc->i4_ctb_cnt)
618    {
619        ps_proc->i4_ctb_slice_y = nxt_ctb_slice_y;
620        ps_proc->i4_ctb_slice_x = nxt_ctb_slice_x;
621        ps_proc->pu1_pu_map = pu1_pu_map_nxt;
622        ps_proc->ps_tu = ps_tu_nxt;
623
624        cur_ctb_tile_x = ps_proc->i4_ctb_tile_x;
625        cur_ctb_tile_y = ps_proc->i4_ctb_tile_y;
626        cur_ctb_slice_x = ps_proc->i4_ctb_slice_x;
627        cur_ctb_slice_y = ps_proc->i4_ctb_slice_y;
628        cur_slice_idx = ps_proc->i4_cur_slice_idx;
629        ps_tu_cur = ps_proc->ps_tu;
630        pu1_pu_map_cur = ps_proc->pu1_pu_map;
631        proc_type = PROC_INTER_PRED;
632
633        if(ps_proc->i4_ctb_cnt < num_ctb)
634        {
635            num_ctb = ps_proc->i4_ctb_cnt;
636        }
637#ifdef GPU_BUILD
638        num_ctb = MIN(num_ctb, (ps_proc->ps_tile->u2_wd - ps_proc->i4_ctb_tile_x));
639#endif
640        num_ctb_tmp = num_ctb;
641
642        while(num_ctb_tmp)
643        {
644            slice_header_t *ps_slice_hdr;
645            tile_t *ps_tile = ps_proc->ps_tile;
646
647            /* Waiting for Parsing to be done*/
648            {
649
650
651                nop_cnt = PROC_NOP_CNT;
652                if(ps_proc->i4_check_parse_status || ps_proc->i4_check_proc_status)
653                {
654                    while(1)
655                    {
656                        volatile UWORD8 *pu1_buf;
657                        volatile WORD32 status;
658                        status = 1;
659#ifdef GPU_BUILD
660                        /* If GPU is enabled, don't check for the status of parsing
661                         * since processing starts after waiting for MC which means
662                         * parsing is done.*/
663                        //TODO GPU : Also remove the flag being updated in parsing
664#endif
665                        /* Check if all dependencies for the next nCTBs are met */
666#ifndef GPU_BUILD
667                        /* Check if the next nCTBs are parsed */
668                        if(ps_proc->i4_check_parse_status)
669                        {
670                            idx = (ps_proc->i4_ctb_x + nctb - 1);
671                            idx += (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb);
672                            pu1_buf = (ps_codec->pu1_parse_map + idx);
673                            status = *pu1_buf;
674                        }
675#endif
676
677                        if(status)
678                            break;
679
680                        /* if dependencies are not met, then wait for few cycles.
681                         * Even after few iterations, if the dependencies are not met then yield
682                         */
683                        if(nop_cnt > 0)
684                        {
685                            NOP(128);
686                            nop_cnt -= 128;
687                        }
688                        else
689                        {
690                            nop_cnt = PROC_NOP_CNT;
691                            ithread_yield();
692                        }
693                    }
694                }
695            }
696
697            /* Check proc map to ensure dependencies for recon are met */
698            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
699
700            ihevcd_slice_hdr_update(ps_proc);
701            ps_slice_hdr = ps_proc->ps_slice_hdr;
702
703            //ihevcd_mv_prediction();
704            //ihevcd_lvl_unpack();
705            //ihevcd_inter_iq_it_recon();
706            //Following does prediction, iq, it and recon on a TU by TU basis for intra TUs
707            //ihevcd_intra_process();
708            //ihevcd_ctb_boundary_strength_islice(ps_proc, ctb_size);
709            //ihevcd_deblk_ctb(ps_proc);
710
711            /* iq,it recon of Intra TU */
712            {
713                UWORD32 *pu4_ctb_top_pu_idx, *pu4_ctb_left_pu_idx, *pu4_ctb_top_left_pu_idx;
714                WORD32 cur_ctb_idx;
715
716                ihevcd_ctb_avail_update(ps_proc);
717
718#if DEBUG_DUMP_FRAME_BUFFERS_INFO
719                au1_pic_avail_ctb_flags[ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb] =
720                                ((ps_proc->u1_top_ctb_avail << 3) | (ps_proc->u1_left_ctb_avail << 2) | (ps_proc->u1_top_lt_ctb_avail << 1) | (ps_proc->u1_top_rt_ctb_avail));
721                au4_pic_ctb_slice_xy[ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb] =
722                                (((UWORD16)ps_proc->i4_ctb_slice_x << 16) | ((UWORD16)ps_proc->i4_ctb_slice_y << 16));
723#endif
724
725                /*************************************************/
726                /****************   MV pred **********************/
727                /*************************************************/
728                if(PSLICE == ps_slice_hdr->i1_slice_type
729                                || BSLICE == ps_slice_hdr->i1_slice_type)
730                {
731                    mv_ctxt_t s_mv_ctxt;
732
733                    pu4_ctb_top_pu_idx = ps_proc->pu4_pic_pu_idx_top
734                                    + (ps_proc->i4_ctb_x * ctb_size / MIN_PU_SIZE);
735                    pu4_ctb_left_pu_idx = ps_proc->pu4_pic_pu_idx_left;
736                    pu4_ctb_top_left_pu_idx = &ps_proc->u4_ctb_top_left_pu_idx;
737
738                    /* Initializing s_mv_ctxt */
739                    if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
740                    {
741                        s_mv_ctxt.ps_pps = ps_proc->ps_pps;
742                        s_mv_ctxt.ps_sps = ps_proc->ps_sps;
743                        s_mv_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
744                        s_mv_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
745                        s_mv_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
746                        s_mv_ctxt.ps_pu = ps_proc->ps_pu;
747                        s_mv_ctxt.ps_pic_pu = ps_proc->ps_pic_pu;
748                        s_mv_ctxt.ps_tile = ps_tile;
749                        s_mv_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
750                        s_mv_ctxt.pu4_pic_pu_idx = ps_proc->pu4_pic_pu_idx;
751                        s_mv_ctxt.pu1_pic_pu_map = ps_proc->pu1_pic_pu_map;
752                        s_mv_ctxt.i4_ctb_pu_cnt = ps_proc->i4_ctb_pu_cnt;
753                        s_mv_ctxt.i4_ctb_start_pu_idx = ps_proc->i4_ctb_start_pu_idx;
754                        s_mv_ctxt.u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
755                        s_mv_ctxt.u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
756                        s_mv_ctxt.u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
757                        s_mv_ctxt.u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
758
759                        ihevcd_get_mv_ctb(&s_mv_ctxt, pu4_ctb_top_pu_idx,
760                                          pu4_ctb_left_pu_idx, pu4_ctb_top_left_pu_idx);
761                    }
762
763                    ihevcd_inter_pred_ctb(ps_proc);
764                }
765                else if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
766                {
767                    WORD32 next_ctb_idx, num_pu_per_ctb, ctb_start_pu_idx, pu_cnt;
768                    pu_t *ps_pu;
769                    WORD32 num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
770                    UWORD8 *pu1_pic_pu_map_ctb = ps_proc->pu1_pic_pu_map +
771                                    (ps_proc->i4_ctb_x + ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb) * num_minpu_in_ctb;
772                    WORD32 row, col;
773                    UWORD32 *pu4_nbr_pu_idx = ps_proc->pu4_pic_pu_idx_map;
774                    WORD32 nbr_pu_idx_strd = MAX_CTB_SIZE / MIN_PU_SIZE + 2;
775
776                    for(row = 0; row < ctb_size / MIN_PU_SIZE; row++)
777                    {
778                        for(col = 0; col < ctb_size / MIN_PU_SIZE; col++)
779                        {
780                            pu1_pic_pu_map_ctb[row * ctb_size / MIN_PU_SIZE + col] = 0;
781                        }
782                    }
783                    /* Neighbor PU idx update inside CTB */
784                    /* 1byte per 4x4. Indicates the PU idx that 4x4 block belongs to */
785
786                    cur_ctb_idx = ps_proc->i4_ctb_x
787                                    + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
788                    next_ctb_idx = ps_proc->i4_next_pu_ctb_cnt;
789                    num_pu_per_ctb = ps_proc->pu4_pic_pu_idx[next_ctb_idx]
790                                    - ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
791                    ctb_start_pu_idx = ps_proc->pu4_pic_pu_idx[cur_ctb_idx];
792                    ps_pu = &ps_proc->ps_pic_pu[ctb_start_pu_idx];
793
794                    for(pu_cnt = 0; pu_cnt < num_pu_per_ctb; pu_cnt++, ps_pu++)
795                    {
796                        UWORD32 cur_pu_idx;
797                        WORD32 pu_ht = (ps_pu->b4_ht + 1) << 2;
798                        WORD32 pu_wd = (ps_pu->b4_wd + 1) << 2;
799
800                        cur_pu_idx = ctb_start_pu_idx + pu_cnt;
801
802                        for(row = 0; row < pu_ht / MIN_PU_SIZE; row++)
803                            for(col = 0; col < pu_wd / MIN_PU_SIZE; col++)
804                                pu4_nbr_pu_idx[(1 + ps_pu->b4_pos_x + col)
805                                                + (1 + ps_pu->b4_pos_y + row)
806                                                * nbr_pu_idx_strd] =
807                                                cur_pu_idx;
808                    }
809
810                    /* Updating Top and Left pointers */
811                    {
812                        WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
813                                        - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
814                        WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
815
816                        /* Top Left */
817                        /* saving top left before updating top ptr, as updating top ptr will overwrite the top left for the next ctb */
818                        ps_proc->u4_ctb_top_left_pu_idx = ps_proc->pu4_pic_pu_idx_top[((ps_proc->i4_ctb_x + 1) * ctb_size / MIN_PU_SIZE) - 1];
819                        for(i = 0; i < ctb_size / MIN_PU_SIZE; i++)
820                        {
821                            /* Left */
822                            /* Last column of pu4_nbr_pu_idx */
823                            ps_proc->pu4_pic_pu_idx_left[i] =
824                                            pu4_nbr_pu_idx[(ctb_size / MIN_PU_SIZE) + (i + 1) * nbr_pu_idx_strd];
825                            /* Top */
826                            /* Last row of pu4_nbr_pu_idx */
827                            ps_proc->pu4_pic_pu_idx_top[(ps_proc->i4_ctb_x * ctb_size / MIN_PU_SIZE) + i] =
828                                            pu4_nbr_pu_idx[(ctb_size_left / MIN_PU_SIZE) * nbr_pu_idx_strd + i + 1];
829
830                        }
831                    }
832                }
833            }
834
835            if(ps_proc->ps_pps->i1_tiles_enabled_flag)
836            {
837                /*Update the tile index buffer with tile information for the current ctb*/
838                UWORD16 *pu1_tile_idx = ps_proc->pu1_tile_idx;
839                pu1_tile_idx[(ps_proc->i4_ctb_x + (ps_proc->i4_ctb_y * ps_sps->i2_pic_wd_in_ctb))]
840                                = ps_proc->i4_cur_tile_idx;
841            }
842
843            /*************************************************/
844            /*********** BS, QP and Deblocking  **************/
845            /*************************************************/
846            /* Boundary strength call has to be after IQ IT recon since QP population needs ps_proc->i4_qp_const_inc_ctb flag */
847
848            {
849                slice_header_t *ps_slice_hdr;
850                ps_slice_hdr = ps_proc->ps_slice_hdr;
851
852
853                /* Check if deblock is disabled for the current slice or if it is disabled for the current picture
854                 * because of disable deblock api
855                 */
856                if(0 == ps_codec->i4_disable_deblk_pic)
857                {
858                    if(ps_codec->i4_num_cores > MV_PRED_NUM_CORES_THRESHOLD)
859                    {
860                        if((0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag) &&
861                                        (0 == ps_codec->i4_slice_error))
862                        {
863                            ihevcd_update_ctb_tu_cnt(ps_proc);
864                            ps_proc->s_bs_ctxt.ps_pps = ps_proc->ps_pps;
865                            ps_proc->s_bs_ctxt.ps_sps = ps_proc->ps_sps;
866                            ps_proc->s_bs_ctxt.ps_codec = ps_proc->ps_codec;
867                            ps_proc->s_bs_ctxt.i4_ctb_tu_cnt = ps_proc->i4_ctb_tu_cnt;
868                            ps_proc->s_bs_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
869                            ps_proc->s_bs_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
870                            ps_proc->s_bs_ctxt.i4_ctb_tile_x = ps_proc->i4_ctb_tile_x;
871                            ps_proc->s_bs_ctxt.i4_ctb_tile_y = ps_proc->i4_ctb_tile_y;
872                            ps_proc->s_bs_ctxt.i4_ctb_slice_x = ps_proc->i4_ctb_slice_x;
873                            ps_proc->s_bs_ctxt.i4_ctb_slice_y = ps_proc->i4_ctb_slice_y;
874                            ps_proc->s_bs_ctxt.ps_tu = ps_proc->ps_tu;
875                            ps_proc->s_bs_ctxt.ps_pu = ps_proc->ps_pu;
876                            ps_proc->s_bs_ctxt.pu4_pic_pu_idx_map = ps_proc->pu4_pic_pu_idx_map;
877                            ps_proc->s_bs_ctxt.i4_next_pu_ctb_cnt = ps_proc->i4_next_pu_ctb_cnt;
878                            ps_proc->s_bs_ctxt.i4_next_tu_ctb_cnt = ps_proc->i4_next_tu_ctb_cnt;
879                            ps_proc->s_bs_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
880                            ps_proc->s_bs_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
881                            ps_proc->s_bs_ctxt.ps_tile = ps_proc->ps_tile;
882
883                            if(ISLICE == ps_slice_hdr->i1_slice_type)
884                            {
885                                ihevcd_ctb_boundary_strength_islice(&ps_proc->s_bs_ctxt);
886                            }
887                            else
888                            {
889                                ihevcd_ctb_boundary_strength_pbslice(&ps_proc->s_bs_ctxt);
890                            }
891                        }
892                        else
893                        {
894                            WORD32 bs_strd = (ps_sps->i2_pic_wd_in_ctb + 1) * (ctb_size * ctb_size / 8 / 16);
895
896                            UWORD32 *pu4_vert_bs = (UWORD32 *)((UWORD8 *)ps_proc->s_bs_ctxt.pu4_pic_vert_bs +
897                                            ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
898                                            ps_proc->i4_ctb_y * bs_strd);
899                            UWORD32 *pu4_horz_bs = (UWORD32 *)((UWORD8 *)ps_proc->s_bs_ctxt.pu4_pic_horz_bs +
900                                            ps_proc->i4_ctb_x * (ctb_size * ctb_size / 8 / 16) +
901                                            ps_proc->i4_ctb_y * bs_strd);
902
903                            memset(pu4_vert_bs, 0, (ctb_size / 8 + 1) * (ctb_size / 4) / 8 * 2);
904                            memset(pu4_horz_bs, 0, (ctb_size / 8) * (ctb_size / 4) / 8 * 2);
905
906                        }
907                    }
908                }
909            }
910
911            /* Per CTB update the following */
912            {
913                WORD32 cur_ctb_idx = ps_proc->i4_ctb_x
914                                + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
915                cur_ctb_idx++;
916
917                ps_proc->pu1_pu_map += nctb * num_minpu_in_ctb;
918                ps_proc->ps_tu += ps_proc->i4_ctb_tu_cnt;
919                if((1 == ps_codec->i4_num_cores) &&
920                                (0 == cur_ctb_idx % RESET_TU_BUF_NCTB))
921                {
922                    ps_proc->ps_tu = ps_proc->ps_pic_tu;
923                }
924                ps_proc->ps_pu += ps_proc->i4_ctb_pu_cnt;
925            }
926
927            /* Update proc map for recon*/
928            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
929
930            num_ctb_tmp -= nctb;
931            ihevcd_ctb_pos_update(ps_proc, nctb);
932
933        }
934
935        if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
936        {
937#ifdef GPU_BUILD
938            //TODO GPU : Later define it for ARM only version as well
939            ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
940#else
941            ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
942#endif
943            ps_proc->i4_cur_slice_idx = cur_slice_idx;
944        }
945        /* Restore the saved variables  */
946        num_ctb_tmp = num_ctb;
947        ps_proc->i4_ctb_x -= num_ctb;
948        ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
949        ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
950        ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
951        ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
952        ps_proc->pu1_pu_map = pu1_pu_map_cur;
953        ps_proc->ps_tu = ps_tu_cur;
954        proc_type = PROC_RECON;
955
956        while(num_ctb_tmp)
957        {
958
959            /* Check proc map to ensure dependencies for recon are met */
960            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
961
962            ihevcd_slice_hdr_update(ps_proc);
963
964            {
965
966                ihevcd_ctb_avail_update(ps_proc);
967
968                /*************************************************/
969                /**************** IQ IT RECON  *******************/
970                /*************************************************/
971
972                ihevcd_update_ctb_tu_cnt(ps_proc);
973
974                /* When scaling matrix is not to be used(scaling_list_enable_flag is zero in SPS),
975                 * default value of 16 has to be used. Since the value is same for all sizes,
976                 * same table is used for all cases.
977                 */
978                if(0 == ps_sps->i1_scaling_list_enable_flag)
979                {
980                    ps_proc->api2_dequant_intra_matrix[0] =
981                                    (WORD16 *)gi2_flat_scale_mat_32x32;
982                    ps_proc->api2_dequant_intra_matrix[1] =
983                                    (WORD16 *)gi2_flat_scale_mat_32x32;
984                    ps_proc->api2_dequant_intra_matrix[2] =
985                                    (WORD16 *)gi2_flat_scale_mat_32x32;
986                    ps_proc->api2_dequant_intra_matrix[3] =
987                                    (WORD16 *)gi2_flat_scale_mat_32x32;
988
989                    ps_proc->api2_dequant_inter_matrix[0] =
990                                    (WORD16 *)gi2_flat_scale_mat_32x32;
991                    ps_proc->api2_dequant_inter_matrix[1] =
992                                    (WORD16 *)gi2_flat_scale_mat_32x32;
993                    ps_proc->api2_dequant_inter_matrix[2] =
994                                    (WORD16 *)gi2_flat_scale_mat_32x32;
995                    ps_proc->api2_dequant_inter_matrix[3] =
996                                    (WORD16 *)gi2_flat_scale_mat_32x32;
997                }
998                else
999                {
1000                    if(0 == ps_sps->i1_sps_scaling_list_data_present_flag)
1001                    {
1002                        ps_proc->api2_dequant_intra_matrix[0] =
1003                                        (WORD16 *)gi2_flat_scale_mat_32x32;
1004                        ps_proc->api2_dequant_intra_matrix[1] =
1005                                        (WORD16 *)gi2_intra_default_scale_mat_8x8;
1006                        ps_proc->api2_dequant_intra_matrix[2] =
1007                                        (WORD16 *)gi2_intra_default_scale_mat_16x16;
1008                        ps_proc->api2_dequant_intra_matrix[3] =
1009                                        (WORD16 *)gi2_intra_default_scale_mat_32x32;
1010
1011                        ps_proc->api2_dequant_inter_matrix[0] =
1012                                        (WORD16 *)gi2_flat_scale_mat_32x32;
1013                        ps_proc->api2_dequant_inter_matrix[1] =
1014                                        (WORD16 *)gi2_inter_default_scale_mat_8x8;
1015                        ps_proc->api2_dequant_inter_matrix[2] =
1016                                        (WORD16 *)gi2_inter_default_scale_mat_16x16;
1017                        ps_proc->api2_dequant_inter_matrix[3] =
1018                                        (WORD16 *)gi2_inter_default_scale_mat_32x32;
1019                    }
1020                    /*TODO: Add support for custom scaling matrices */
1021                }
1022
1023
1024                /* CTB Level pointers */
1025                ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
1026                                + (ps_proc->i4_ctb_x * ctb_size
1027                                + ps_proc->i4_ctb_y * ctb_size
1028                                * ps_codec->i4_strd);
1029                ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
1030                                + ps_proc->i4_ctb_x * ctb_size
1031                                + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
1032#if DEBUG_PRINT_IQ_IT_RECON
1033                printf("\nCTB x=%d, y=%d", ps_proc->i4_ctb_x, ps_proc->i4_ctb_y);
1034                printf("\n CTB size= %d,CTB level availability: L=%d,TL=%d,TR=%d,T=%d",
1035                       ctb_size, ps_proc->u1_left_ctb_avail, ps_proc->u1_top_lt_ctb_avail, ps_proc->u1_top_rt_ctb_avail,
1036                       ps_proc->u1_top_ctb_avail);
1037#endif
1038
1039                ihevcd_iquant_itrans_recon_ctb(ps_proc);
1040            }
1041
1042            /* Per CTB update the following */
1043            {
1044                WORD32 cur_ctb_idx = ps_proc->i4_ctb_x
1045                                + ps_proc->i4_ctb_y * (ps_sps->i2_pic_wd_in_ctb);
1046                cur_ctb_idx++;
1047
1048                ps_proc->pu1_pu_map += nctb * num_minpu_in_ctb;
1049                ps_proc->ps_tu += ps_proc->i4_ctb_tu_cnt;
1050                if((1 == ps_codec->i4_num_cores) &&
1051                                (0 == cur_ctb_idx % RESET_TU_BUF_NCTB))
1052                {
1053                    ps_proc->ps_tu = ps_proc->ps_pic_tu;
1054                }
1055                ps_proc->ps_pu += ps_proc->i4_ctb_pu_cnt;
1056            }
1057
1058
1059            /* Update proc map for recon*/
1060            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
1061
1062            num_ctb_tmp -= nctb;
1063            ihevcd_ctb_pos_update(ps_proc, nctb);
1064        }
1065
1066        if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
1067        {
1068#ifdef GPU_BUILD
1069            //TODO GPU : Later define it for ARM only version as well
1070            ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
1071#else
1072            ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
1073#endif
1074            ps_proc->i4_cur_slice_idx = cur_slice_idx;
1075        }
1076        /* Restore the saved variables  */
1077        num_ctb_tmp = num_ctb;
1078        ps_proc->i4_ctb_x -= num_ctb;
1079        ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
1080        ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
1081        ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
1082        ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
1083        pu1_pu_map_nxt = ps_proc->pu1_pu_map;
1084        ps_tu_nxt = ps_proc->ps_tu;
1085        ps_proc->pu1_pu_map = pu1_pu_map_cur;
1086        ps_proc->ps_tu = ps_tu_cur;
1087        proc_type = PROC_DEBLK;
1088
1089        while(num_ctb_tmp)
1090        {
1091            slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
1092
1093            /* Check proc map to ensure dependencies for deblk are met */
1094            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
1095
1096            ihevcd_slice_hdr_update(ps_proc);
1097            ps_slice_hdr = ps_proc->ps_slice_hdr;
1098
1099            if(((0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)) &&
1100               (0 == ps_codec->i4_disable_deblk_pic))
1101            {
1102                WORD32 i4_is_last_ctb_x = 0;
1103                WORD32 i4_is_last_ctb_y = 0;
1104
1105                if(0 == ps_slice_hdr->i1_slice_disable_deblocking_filter_flag ||
1106                                (ps_proc->i4_ctb_slice_x == 0) ||
1107                                (ps_proc->i4_ctb_slice_y == 0))
1108                {
1109                    ps_proc->s_deblk_ctxt.ps_pps = ps_proc->ps_pps;
1110                    ps_proc->s_deblk_ctxt.ps_sps = ps_proc->ps_sps;
1111                    ps_proc->s_deblk_ctxt.ps_codec = ps_proc->ps_codec;
1112                    ps_proc->s_deblk_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
1113                    ps_proc->s_deblk_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
1114                    ps_proc->s_deblk_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
1115                    ps_proc->s_deblk_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
1116                    ps_proc->s_deblk_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
1117
1118                    /* Populating Current CTB's no_loop_filter flags */
1119                    {
1120                        WORD32 row;
1121                        WORD32 log2_ctb_size = ps_sps->i1_log2_ctb_size;
1122
1123                        /* Loop filter strd in units of num bits */
1124                        WORD32 loop_filter_strd = ((ps_sps->i2_pic_width_in_luma_samples + 63) >> 6) << 3;
1125                        /* Bit position is the current 8x8 bit offset wrt pic_no_loop_filter
1126                         * bit_pos has to be a WORD32 so that when it is negative, the downshift still retains it to be a negative value */
1127                        WORD32 bit_pos = ((ps_proc->i4_ctb_y << (log2_ctb_size - 3)) - 1) * loop_filter_strd + (ps_proc->i4_ctb_x << (log2_ctb_size - 3)) - 1;
1128
1129                        for(row = 0; row < (ctb_size >> 3) + 1; row++)
1130                        {
1131                            /* Go to the corresponding byte - read 32 bits and downshift */
1132                            ps_proc->s_deblk_ctxt.au2_ctb_no_loop_filter_flag[row] = (*(UWORD32 *)(ps_proc->pu1_pic_no_loop_filter_flag + (bit_pos >> 3))) >> (bit_pos & 7);
1133                            bit_pos += loop_filter_strd;
1134                        }
1135                    }
1136
1137                    ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
1138
1139                    /* If the last CTB in the row was a complete CTB then deblocking has to be called from remaining pixels, since deblocking
1140                     * is applied on a shifted CTB structure
1141                     */
1142                    if(ps_proc->i4_ctb_x == ps_sps->i2_pic_wd_in_ctb - 1)
1143                    {
1144                        WORD32 i4_is_last_ctb_x = 1;
1145                        WORD32 i4_is_last_ctb_y = 0;
1146
1147                        WORD32 last_x_pos;
1148                        last_x_pos = (ps_sps->i2_pic_wd_in_ctb << ps_sps->i1_log2_ctb_size);
1149                        if(last_x_pos  ==  ps_sps->i2_pic_width_in_luma_samples)
1150                        {
1151                            ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
1152                        }
1153                    }
1154
1155
1156                    /* If the last CTB in the column was a complete CTB then deblocking has to be called from remaining pixels, since deblocking
1157                     * is applied on a shifted CTB structure
1158                     */
1159                    if(ps_proc->i4_ctb_y == ps_sps->i2_pic_ht_in_ctb - 1)
1160                    {
1161                        WORD32 i4_is_last_ctb_x = 0;
1162                        WORD32 i4_is_last_ctb_y = 1;
1163                        WORD32 last_y_pos;
1164                        last_y_pos = (ps_sps->i2_pic_ht_in_ctb << ps_sps->i1_log2_ctb_size);
1165                        if(last_y_pos == ps_sps->i2_pic_height_in_luma_samples)
1166                        {
1167                            ihevcd_deblk_ctb(&ps_proc->s_deblk_ctxt, i4_is_last_ctb_x, i4_is_last_ctb_y);
1168                        }
1169                    }
1170                }
1171            }
1172
1173            /* Update proc map for deblk*/
1174            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
1175
1176            num_ctb_tmp -= nctb;
1177            ihevcd_ctb_pos_update(ps_proc, nctb);
1178        }
1179
1180        if(cur_slice_idx != ps_proc->i4_cur_slice_idx)
1181        {
1182#ifdef GPU_BUILD
1183            //TODO GPU : Later define it for ARM only version as well
1184            ps_proc->ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
1185#else
1186            ps_proc->ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((cur_slice_idx)&(MAX_SLICE_HDR_CNT - 1));
1187#endif
1188            ps_proc->i4_cur_slice_idx = cur_slice_idx;
1189        }
1190        /* Restore the saved variables  */
1191        num_ctb_tmp = num_ctb;
1192        ps_proc->i4_ctb_x -= num_ctb;
1193        ps_proc->i4_ctb_tile_x = cur_ctb_tile_x;
1194        ps_proc->i4_ctb_tile_y = cur_ctb_tile_y;
1195        ps_proc->pu1_pu_map = pu1_pu_map_cur;
1196        ps_proc->ps_tu = ps_tu_cur;
1197        nxt_ctb_slice_y = ps_proc->i4_ctb_slice_y;
1198        nxt_ctb_slice_x = ps_proc->i4_ctb_slice_x;
1199        ps_proc->i4_ctb_slice_y = cur_ctb_slice_y;
1200        ps_proc->i4_ctb_slice_x = cur_ctb_slice_x;
1201        proc_type = PROC_SAO;
1202
1203        while(num_ctb_tmp)
1204        {
1205            slice_header_t *ps_slice_hdr = ps_proc->ps_slice_hdr;
1206
1207            /* Check proc map to ensure dependencies for SAO are met */
1208            ihevcd_proc_map_check(ps_proc, proc_type, nctb);
1209
1210            ihevcd_slice_hdr_update(ps_proc);
1211            ps_slice_hdr = ps_proc->ps_slice_hdr;
1212
1213            if(0 == FRAME_ILF_PAD || ps_codec->i4_num_cores != 1)
1214            {
1215                if((0 == ps_codec->i4_disable_sao_pic) &&
1216                                (ps_slice_hdr->i1_slice_sao_luma_flag || ps_slice_hdr->i1_slice_sao_chroma_flag))
1217                {
1218                    ps_proc->s_sao_ctxt.ps_pps = ps_proc->ps_pps;
1219                    ps_proc->s_sao_ctxt.ps_sps = ps_proc->ps_sps;
1220                    ps_proc->s_sao_ctxt.ps_tile = ps_proc->ps_tile;
1221                    ps_proc->s_sao_ctxt.ps_codec = ps_proc->ps_codec;
1222                    ps_proc->s_sao_ctxt.ps_slice_hdr = ps_proc->ps_slice_hdr;
1223                    ps_proc->s_sao_ctxt.i4_cur_slice_idx = ps_proc->i4_cur_slice_idx;
1224
1225
1226#if SAO_PROCESS_SHIFT_CTB
1227                    ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
1228                    ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
1229                    ps_proc->s_sao_ctxt.is_chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
1230
1231                    ihevcd_sao_shift_ctb(&ps_proc->s_sao_ctxt);
1232#else
1233                    if(ps_proc->i4_ctb_x > 1 && ps_proc->i4_ctb_y > 0)
1234                    {
1235                        ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x - 2;
1236                        ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
1237
1238                        ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
1239                    }
1240
1241                    if(ps_sps->i2_pic_wd_in_ctb - 1 == ps_proc->i4_ctb_x && ps_proc->i4_ctb_y > 0)
1242                    {
1243                        ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x - 1;
1244                        ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
1245
1246                        ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
1247
1248                        ps_proc->s_sao_ctxt.i4_ctb_x = ps_proc->i4_ctb_x;
1249                        ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y - 1;
1250
1251                        ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
1252
1253                        if(ps_sps->i2_pic_ht_in_ctb - 1 == ps_proc->i4_ctb_y)
1254                        {
1255                            WORD32 i4_ctb_x;
1256                            ps_proc->s_sao_ctxt.i4_ctb_y = ps_proc->i4_ctb_y;
1257                            for(i4_ctb_x = 0; i4_ctb_x < ps_sps->i2_pic_wd_in_ctb; i4_ctb_x++)
1258                            {
1259                                ps_proc->s_sao_ctxt.i4_ctb_x = i4_ctb_x;
1260                                ihevcd_sao_ctb(&ps_proc->s_sao_ctxt);
1261                            }
1262                        }
1263                    }
1264#endif
1265                }
1266
1267
1268                /* Call padding if required */
1269                {
1270#if SAO_PROCESS_SHIFT_CTB
1271
1272                    if(0 == ps_proc->i4_ctb_x)
1273                    {
1274                        WORD32 pad_ht_luma;
1275                        WORD32 pad_ht_chroma;
1276
1277                        ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
1278                                        + (ps_proc->i4_ctb_x * ctb_size
1279                                        + ps_proc->i4_ctb_y * ctb_size
1280                                        * ps_codec->i4_strd);
1281                        ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
1282                                        + ps_proc->i4_ctb_x * ctb_size
1283                                        + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
1284
1285                        pad_ht_luma = ctb_size;
1286                        pad_ht_luma += (ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y ? 8 : 0;
1287                        pad_ht_chroma = ctb_size / 2;
1288                        /* Pad left after 1st CTB is processed */
1289                        ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_ctb_luma - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
1290                        ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_ctb_chroma - 16 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
1291                    }
1292
1293                    if((ps_sps->i2_pic_wd_in_ctb - 1) == ps_proc->i4_ctb_x)
1294                    {
1295                        WORD32 pad_ht_luma;
1296                        WORD32 pad_ht_chroma;
1297                        WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
1298
1299                        ps_proc->pu1_cur_ctb_luma = ps_proc->pu1_cur_pic_luma
1300                                        + (ps_proc->i4_ctb_x * ctb_size
1301                                        + ps_proc->i4_ctb_y * ctb_size
1302                                        * ps_codec->i4_strd);
1303                        ps_proc->pu1_cur_ctb_chroma = ps_proc->pu1_cur_pic_chroma
1304                                        + ps_proc->i4_ctb_x * ctb_size
1305                                        + (ps_proc->i4_ctb_y * ctb_size * ps_codec->i4_strd / 2);
1306
1307                        pad_ht_luma = ctb_size;
1308                        pad_ht_chroma = ctb_size / 2;
1309                        if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
1310                        {
1311                            pad_ht_luma += 8;
1312                            pad_ht_chroma += 16;
1313                            ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_pic_chroma + (ps_sps->i2_pic_height_in_luma_samples / 2 - 16) * ps_codec->i4_strd,
1314                                                                                 ps_codec->i4_strd, 16, PAD_LEFT);
1315                        }
1316                        /* Pad right after last CTB in the current row is processed */
1317                        ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(ps_proc->pu1_cur_ctb_luma + cols_remaining - 8 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
1318                        ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(ps_proc->pu1_cur_ctb_chroma + cols_remaining - 16 * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
1319
1320                        if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
1321                        {
1322                            UWORD8 *pu1_buf;
1323                            /* Since SAO is shifted by 8x8, chroma padding can not be done till second row is processed */
1324                            /* Hence moving top padding to the end of the frame; moving it to the second row also results in problems when there is only one row */
1325                            /* Pad top after padding left and right for current rows after processing 1st CTB row */
1326                            ihevc_pad_top(ps_proc->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
1327                            ihevc_pad_top(ps_proc->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
1328
1329                            pu1_buf = ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
1330                            /* Pad bottom after padding left and right; done at the last CTB of the last CTB row */
1331                            ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
1332
1333                            pu1_buf = ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
1334                            ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
1335                        }
1336                    }
1337#else
1338                    if(ps_proc->i4_ctb_y > 1)
1339                    {
1340                        if(0 == ps_proc->i4_ctb_x)
1341                        {
1342                            WORD32 pad_ht_luma;
1343                            WORD32 pad_ht_chroma;
1344
1345                            pad_ht_luma = ctb_size;
1346                            pad_ht_chroma = ctb_size / 2;
1347                            /* Pad left after 1st CTB is processed */
1348                            ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_ctb_luma - 2 * ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
1349                            ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_ctb_chroma - ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
1350                        }
1351                        else if((ps_sps->i2_pic_wd_in_ctb - 1) == ps_proc->i4_ctb_x)
1352                        {
1353                            WORD32 pad_ht_luma;
1354                            WORD32 pad_ht_chroma;
1355                            WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
1356
1357                            pad_ht_luma = ((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y) ? 3 * ctb_size : ctb_size;
1358                            pad_ht_chroma = ((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y) ? 3 * ctb_size / 2 : ctb_size / 2;
1359                            /* Pad right after last CTB in the current row is processed */
1360                            ps_codec->s_func_selector.ihevc_pad_right_luma_fptr(ps_proc->pu1_cur_ctb_luma + cols_remaining - 2 * ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_luma, PAD_RIGHT);
1361                            ps_codec->s_func_selector.ihevc_pad_right_chroma_fptr(ps_proc->pu1_cur_ctb_chroma + cols_remaining - ctb_size * ps_codec->i4_strd, ps_codec->i4_strd, pad_ht_chroma, PAD_RIGHT);
1362
1363                            if((ps_sps->i2_pic_ht_in_ctb - 1) == ps_proc->i4_ctb_y)
1364                            {
1365                                UWORD8 *pu1_buf;
1366                                WORD32 pad_ht_luma;
1367                                WORD32 pad_ht_chroma;
1368
1369                                pad_ht_luma = 2 * ctb_size;
1370                                pad_ht_chroma = ctb_size;
1371
1372                                ps_codec->s_func_selector.ihevc_pad_left_luma_fptr(ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples - 2 * ctb_size),
1373                                                                                   ps_codec->i4_strd, pad_ht_luma, PAD_LEFT);
1374                                ps_codec->s_func_selector.ihevc_pad_left_chroma_fptr(ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2 - ctb_size),
1375                                                                                     ps_codec->i4_strd, pad_ht_chroma, PAD_LEFT);
1376
1377                                /* Since SAO is shifted by 8x8, chroma padding can not be done till second row is processed */
1378                                /* Hence moving top padding to the end of the frame. Moving it to the second row also results in problems when there is only one row */
1379                                /* Pad top after padding left and right for current rows after processing 1st CTB row */
1380                                ihevc_pad_top(ps_proc->pu1_cur_pic_luma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP);
1381                                ihevc_pad_top(ps_proc->pu1_cur_pic_chroma - PAD_LEFT, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_TOP / 2);
1382
1383                                pu1_buf = ps_proc->pu1_cur_pic_luma + ps_codec->i4_strd * ps_sps->i2_pic_height_in_luma_samples - PAD_LEFT;
1384                                /* Pad bottom after left and right padding of the last CTB row is done */
1385                                ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT);
1386
1387                                pu1_buf = ps_proc->pu1_cur_pic_chroma + ps_codec->i4_strd * (ps_sps->i2_pic_height_in_luma_samples / 2) - PAD_LEFT;
1388                                ihevc_pad_bottom(pu1_buf, ps_codec->i4_strd, ps_sps->i2_pic_width_in_luma_samples + PAD_WD, PAD_BOT / 2);
1389                            }
1390                        }
1391                    }
1392#endif
1393                }
1394            }
1395
1396
1397            /* Update proc map for SAO*/
1398            ihevcd_proc_map_update(ps_proc, proc_type, nctb);
1399            /* Update proc map for Completion of CTB*/
1400            ihevcd_proc_map_update(ps_proc, PROC_ALL, nctb);
1401            {
1402                tile_t *ps_tile;
1403
1404                ps_tile = ps_proc->ps_tile;
1405                num_ctb_tmp -= nctb;
1406
1407                ps_proc->i4_ctb_tile_x += nctb;
1408                ps_proc->i4_ctb_x += nctb;
1409
1410                ps_proc->i4_ctb_slice_x += nctb;
1411
1412
1413                /* Update tile counters */
1414                if(ps_proc->i4_ctb_tile_x >= (ps_tile->u2_wd))
1415                {
1416                    /*End of tile row*/
1417                    ps_proc->i4_ctb_tile_x = 0;
1418                    ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
1419
1420                    ps_proc->i4_ctb_tile_y++;
1421                    ps_proc->i4_ctb_y++;
1422                    if(ps_proc->i4_ctb_tile_y == ps_tile->u2_ht)
1423                    {
1424                        /* Reached End of Tile */
1425                        ps_proc->i4_ctb_tile_y = 0;
1426                        ps_proc->i4_ctb_tile_x = 0;
1427                        ps_proc->ps_tile++;
1428                        //End of picture
1429                        if(!((ps_tile->u2_ht + ps_tile->u1_pos_y  ==  ps_sps->i2_pic_ht_in_ctb) && (ps_tile->u2_wd + ps_tile->u1_pos_x  ==  ps_sps->i2_pic_wd_in_ctb)))
1430                        {
1431                            ps_tile = ps_proc->ps_tile;
1432                            ps_proc->i4_ctb_x = ps_tile->u1_pos_x;
1433                            ps_proc->i4_ctb_y = ps_tile->u1_pos_y;
1434
1435                        }
1436                    }
1437                }
1438            }
1439        }
1440
1441        ps_proc->i4_ctb_cnt -= num_ctb;
1442    }
1443    return ret;
1444}
1445
1446void ihevcd_init_proc_ctxt(process_ctxt_t *ps_proc, WORD32 tu_coeff_data_ofst)
1447{
1448    codec_t *ps_codec;
1449    slice_header_t *ps_slice_hdr;
1450    pps_t *ps_pps;
1451    sps_t *ps_sps;
1452    tile_t *ps_tile, *ps_tile_prev;
1453    WORD32 tile_idx;
1454    WORD32 ctb_size;
1455    WORD32 num_minpu_in_ctb;
1456    WORD32 num_ctb_in_row;
1457    WORD32 ctb_addr;
1458    WORD32 i4_wd_in_ctb;
1459    WORD32 tile_start_ctb_idx;
1460    WORD32 slice_start_ctb_idx;
1461    WORD32 check_tile_wd;
1462    WORD32 continuous_tiles = 0; //Refers to tiles that are continuous, within a slice, horizontally
1463
1464    ps_codec = ps_proc->ps_codec;
1465
1466#ifdef GPU_BUILD
1467    //TODO GPU : Later define it for ARM only version as well
1468    ps_slice_hdr = ps_proc->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx) & (MAX_SLICE_HDR_CNT - 1));
1469#else
1470    ps_slice_hdr = ps_codec->ps_slice_hdr_base + ((ps_proc->i4_cur_slice_idx) & (MAX_SLICE_HDR_CNT - 1));
1471#endif
1472    ps_proc->ps_slice_hdr = ps_slice_hdr;
1473    ps_proc->ps_pps = ps_codec->ps_pps_base + ps_slice_hdr->i1_pps_id;
1474    ps_pps = ps_proc->ps_pps;
1475    ps_proc->ps_sps = ps_codec->ps_sps_base + ps_pps->i1_sps_id;
1476    ps_sps = ps_proc->ps_sps;
1477    ps_proc->i4_init_done = 1;
1478    ctb_size = 1 << ps_sps->i1_log2_ctb_size;
1479    num_minpu_in_ctb = (ctb_size / MIN_PU_SIZE) * (ctb_size / MIN_PU_SIZE);
1480    num_ctb_in_row = ps_sps->i2_pic_wd_in_ctb;
1481
1482    ps_proc->s_sao_ctxt.pu1_slice_idx = ps_proc->pu1_slice_idx;
1483
1484    ihevcd_get_tile_pos(ps_pps, ps_sps, ps_proc->i4_ctb_x, ps_proc->i4_ctb_y,
1485                        &ps_proc->i4_ctb_tile_x, &ps_proc->i4_ctb_tile_y,
1486                        &tile_idx);
1487
1488    ps_proc->ps_tile = ps_pps->ps_tile + tile_idx;
1489    ps_proc->i4_cur_tile_idx = tile_idx;
1490    ps_tile = ps_proc->ps_tile;
1491
1492    if(ps_pps->i1_tiles_enabled_flag)
1493    {
1494        if(tile_idx)
1495            ps_tile_prev = ps_tile - 1;
1496        else
1497            ps_tile_prev = ps_tile;
1498
1499        slice_start_ctb_idx = ps_slice_hdr->i2_ctb_x + (ps_slice_hdr->i2_ctb_y * ps_sps->i2_pic_wd_in_ctb);
1500        tile_start_ctb_idx = ps_tile->u1_pos_x + (ps_tile->u1_pos_y * ps_sps->i2_pic_wd_in_ctb);
1501
1502        /*Check if
1503         * 1. Last tile that ends in frame boundary and 1st tile in next row belongs to same slice
1504         * 1.1. If it does, check if the slice that has these tiles spans across the frame row.
1505         * 2. Vertical tiles are present within a slice */
1506        if(((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x) && (ps_slice_hdr->i2_ctb_y != ps_tile->u1_pos_y)))
1507        {
1508            continuous_tiles = 1;
1509        }
1510        else
1511        {
1512            check_tile_wd = ps_slice_hdr->i2_ctb_x + ps_tile_prev->u2_wd;
1513            if(!(((check_tile_wd >= ps_sps->i2_pic_wd_in_ctb) && (check_tile_wd % ps_sps->i2_pic_wd_in_ctb == ps_tile->u1_pos_x))
1514                                            || ((ps_slice_hdr->i2_ctb_x == ps_tile->u1_pos_x))))
1515            {
1516                continuous_tiles = 1;
1517            }
1518        }
1519
1520        {
1521            WORD32 i2_independent_ctb_x = ps_slice_hdr->i2_independent_ctb_x;
1522            WORD32 i2_independent_ctb_y = ps_slice_hdr->i2_independent_ctb_y;
1523
1524            /* Handles cases where
1525             * 1. Slices begin at the start of each tile
1526             * 2. Tiles lie in the same slice row.i.e, starting tile_x > slice_x, but tile_y == slice_y
1527             * */
1528            if(ps_proc->i4_ctb_x >= i2_independent_ctb_x)
1529            {
1530                ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - i2_independent_ctb_x;
1531            }
1532            else
1533            {
1534                /* Indicates multiple tiles in a slice case where
1535                 * The new tile belongs to an older slice that started in the previous rows-not the present row
1536                 * & (tile_y > slice_y and tile_x < slice_x)
1537                 */
1538                if((slice_start_ctb_idx < tile_start_ctb_idx) && (continuous_tiles))
1539                {
1540                    i4_wd_in_ctb = ps_sps->i2_pic_wd_in_ctb;
1541                }
1542                /* Indicates many-tiles-in-one-slice case, for slices that end without spanning the frame width*/
1543                else
1544                {
1545                    i4_wd_in_ctb = ps_tile->u2_wd;
1546                }
1547
1548                if(continuous_tiles)
1549                {
1550                    ps_proc->i4_ctb_slice_x = i4_wd_in_ctb
1551                                    - (i2_independent_ctb_x - ps_proc->i4_ctb_x);
1552                }
1553                else
1554                {
1555                    ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - ps_tile->u1_pos_x;
1556                }
1557            }
1558            /* Initialize ctb slice y to zero and at the start of slice row initialize it
1559        to difference between ctb_y and slice's start ctb y */
1560
1561            ps_proc->i4_ctb_slice_y = ps_proc->i4_ctb_y - i2_independent_ctb_y;
1562
1563            /*If beginning of tile, check if slice counters are set correctly*/
1564            if((0 == ps_proc->i4_ctb_tile_x) && (0 == ps_proc->i4_ctb_tile_y))
1565            {
1566                if(ps_slice_hdr->i1_dependent_slice_flag)
1567                {
1568                    ps_proc->i4_ctb_slice_x = 0;
1569                    ps_proc->i4_ctb_slice_y = 0;
1570                }
1571                /*For slices that span across multiple tiles*/
1572                else if(slice_start_ctb_idx < tile_start_ctb_idx)
1573                {
1574                    ps_proc->i4_ctb_slice_y = ps_tile->u1_pos_y - i2_independent_ctb_y;
1575                    /* Two Cases
1576                     * 1 - slice spans across frame-width- but dose not start from 1st column
1577                     * 2 - Slice spans across multiple tiles anywhere is a frame
1578                     */
1579                    /*TODO:In a multiple slice clip,  if an independent slice span across more than 2 tiles in a row, it is not supported*/
1580                    if(continuous_tiles) //Case 2-implemented for slices that span not more than 2 tiles
1581                    {
1582                        if(i2_independent_ctb_y <= ps_tile->u1_pos_y)
1583                        {
1584                            //Check if ctb x is before or after
1585                            if(i2_independent_ctb_x > ps_tile->u1_pos_x)
1586                            {
1587                                ps_proc->i4_ctb_slice_y -= 1;
1588                            }
1589                        }
1590                    }
1591                }
1592            }
1593            //Slice starts from a column which is not the starting tile-column, but is within the tile
1594            if(((i2_independent_ctb_x - ps_tile->u1_pos_x) != 0) && ((ps_proc->i4_ctb_slice_y != 0))
1595                            && ((i2_independent_ctb_x >= ps_tile->u1_pos_x) && (i2_independent_ctb_x < ps_tile->u1_pos_x + ps_tile->u2_wd)))
1596            {
1597                ps_proc->i4_ctb_slice_y -= 1;
1598            }
1599        }
1600    }
1601    else
1602    {
1603        WORD32 i2_independent_ctb_x = ps_slice_hdr->i2_independent_ctb_x;
1604        WORD32 i2_independent_ctb_y = ps_slice_hdr->i2_independent_ctb_y;
1605
1606
1607        {
1608            ps_proc->i4_ctb_slice_x = ps_proc->i4_ctb_x - i2_independent_ctb_x;
1609            ps_proc->i4_ctb_slice_y = ps_proc->i4_ctb_y - i2_independent_ctb_y;
1610            if(ps_proc->i4_ctb_slice_x < 0)
1611            {
1612                ps_proc->i4_ctb_slice_x += ps_sps->i2_pic_wd_in_ctb;
1613                ps_proc->i4_ctb_slice_y -= 1;
1614            }
1615
1616            /* Initialize ctb slice y to zero and at the start of slice row initialize it
1617            to difference between ctb_y and slice's start ctb y */
1618        }
1619    }
1620
1621    /* Compute TU offset for the current CTB set */
1622    {
1623
1624        WORD32 ctb_luma_min_tu_cnt;
1625        WORD32 ctb_addr;
1626
1627        ctb_addr = ps_proc->i4_ctb_y * num_ctb_in_row + ps_proc->i4_ctb_x;
1628
1629        ctb_luma_min_tu_cnt = (1 << ps_sps->i1_log2_ctb_size) / MIN_TU_SIZE;
1630        ctb_luma_min_tu_cnt *= ctb_luma_min_tu_cnt;
1631
1632        ps_proc->pu1_tu_map = ps_proc->pu1_pic_tu_map
1633                        + ctb_luma_min_tu_cnt * ctb_addr;
1634        if(1 == ps_codec->i4_num_cores)
1635        {
1636            ps_proc->ps_tu = ps_proc->ps_pic_tu + ps_proc->pu4_pic_tu_idx[ctb_addr % RESET_TU_BUF_NCTB];
1637        }
1638        else
1639        {
1640            ps_proc->ps_tu = ps_proc->ps_pic_tu + ps_proc->pu4_pic_tu_idx[ctb_addr];
1641        }
1642        ps_proc->pv_tu_coeff_data = (UWORD8 *)ps_proc->pv_pic_tu_coeff_data
1643                        + tu_coeff_data_ofst;
1644
1645    }
1646
1647    /* Compute PU related elements for the current CTB set */
1648    {
1649        WORD32 pu_idx;
1650        ctb_addr = ps_proc->i4_ctb_y * num_ctb_in_row + ps_proc->i4_ctb_x;
1651        pu_idx = ps_proc->pu4_pic_pu_idx[ctb_addr];
1652        ps_proc->pu1_pu_map = ps_proc->pu1_pic_pu_map
1653                        + ctb_addr * num_minpu_in_ctb;
1654        ps_proc->ps_pu = ps_proc->ps_pic_pu + pu_idx;
1655    }
1656
1657    /* Number of ctbs processed in one loop of process function */
1658    {
1659        ps_proc->i4_nctb = MIN(ps_codec->u4_nctb, ps_tile->u2_wd);
1660    }
1661
1662}
1663void ihevcd_process_thread(process_ctxt_t *ps_proc)
1664{
1665#ifdef GPU_BUILD
1666    codec_t *ps_codec = ps_proc->ps_codec;
1667#endif
1668    {
1669        ithread_set_affinity(ps_proc->i4_id + 1);
1670    }
1671    while(1)
1672    {
1673        IHEVCD_ERROR_T ret;
1674        proc_job_t s_job;
1675
1676        ret = ihevcd_jobq_dequeue((jobq_t *)ps_proc->pv_proc_jobq, &s_job,
1677                                  sizeof(proc_job_t), 1);
1678        if((IHEVCD_ERROR_T)IHEVCD_SUCCESS != ret)
1679            break;
1680
1681        ps_proc->i4_ctb_cnt = s_job.i2_ctb_cnt;
1682        ps_proc->i4_ctb_x = s_job.i2_ctb_x;
1683        ps_proc->i4_ctb_y = s_job.i2_ctb_y;
1684        ps_proc->i4_cur_slice_idx = s_job.i2_slice_idx;
1685
1686
1687
1688        if(CMD_PROCESS == s_job.i4_cmd)
1689        {
1690            ihevcd_init_proc_ctxt(ps_proc, s_job.i4_tu_coeff_data_ofst);
1691#ifdef GPU_BUILD
1692            if(1) //g_enable_gpu == 1)
1693            {
1694
1695                if(s_job.i2_wait)
1696                {
1697                    //long long start_time, stop_time;
1698                    //start_time = itGetUs();
1699                    //printf("Before MC wait\n");
1700                    ihevcd_gpu_mc_wait(ps_proc, s_job.i2_granularity_idx);
1701                    //printf("After MC wait\n");
1702                    //stop_time = itGetUs();
1703                    //printf("CL Wait time time = %lld us\n", (stop_time - start_time));
1704                }
1705
1706            }
1707#endif
1708            ihevcd_process(ps_proc);
1709        }
1710        else if(CMD_FMTCONV == s_job.i4_cmd)
1711        {
1712            sps_t *ps_sps;
1713            codec_t *ps_codec;
1714            ivd_out_bufdesc_t *ps_out_buffer;
1715            WORD32 num_rows;
1716
1717            if(0 == ps_proc->i4_init_done)
1718            {
1719                ihevcd_init_proc_ctxt(ps_proc, 0);
1720            }
1721            ps_sps = ps_proc->ps_sps;
1722            ps_codec = ps_proc->ps_codec;
1723            ps_out_buffer = ps_proc->ps_out_buffer;
1724            num_rows = 1 << ps_sps->i1_log2_ctb_size;
1725
1726            num_rows = MIN(num_rows, (ps_codec->i4_disp_ht - (s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size)));
1727
1728            if(num_rows < 0)
1729                num_rows = 0;
1730
1731            ihevcd_fmt_conv(ps_proc->ps_codec, ps_proc, ps_out_buffer->pu1_bufs[0], ps_out_buffer->pu1_bufs[1], ps_out_buffer->pu1_bufs[2],
1732                            s_job.i2_ctb_y << ps_sps->i1_log2_ctb_size, num_rows);
1733        }
1734    }
1735    //ithread_exit(0);
1736    return;
1737}
1738
1739