1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20
21/**
22 *******************************************************************************
23 * @file
24 *  ih264e_core_coding.c
25 *
26 * @brief
27 *  This file contains routines that perform luma and chroma core coding for
28 *  intra macroblocks
29 *
30 * @author
31 *  ittiam
32 *
33 * @par List of Functions:
34 *  - ih264e_pack_l_mb_i16()
35 *  - ih264e_pack_c_mb_i8()
36 *  - ih264e_code_luma_intra_macroblock_16x16()
37 *  - ih264e_code_luma_intra_macroblock_4x4()
38 *  - ih264e_code_chroma_intra_macroblock_8x8()
39 *
40 * @remarks
41 *  None
42 *
43 *******************************************************************************
44 */
45
46/*****************************************************************************/
47/* File Includes                                                             */
48/*****************************************************************************/
49
50/* System include files */
51#include <stdio.h>
52#include <string.h>
53#include <assert.h>
54
55/* User include files */
56#include "ih264e_config.h"
57#include "ih264_typedefs.h"
58#include "ih264_platform_macros.h"
59#include "iv2.h"
60#include "ive2.h"
61#include "ih264_macros.h"
62#include "ih264_defs.h"
63#include "ih264e_defs.h"
64#include "ih264_trans_data.h"
65#include "ih264e_error.h"
66#include "ih264e_bitstream.h"
67#include "ime_distortion_metrics.h"
68#include "ime_defs.h"
69#include "ime_structs.h"
70#include "ih264_structs.h"
71#include "ih264_trans_quant_itrans_iquant.h"
72#include "ih264_inter_pred_filters.h"
73#include "ih264_mem_fns.h"
74#include "ih264_padding.h"
75#include "ih264_intra_pred_filters.h"
76#include "ih264_deblk_edge_filters.h"
77#include "ih264_cabac_tables.h"
78#include "irc_cntrl_param.h"
79#include "irc_frame_info_collector.h"
80#include "ih264e_rate_control.h"
81#include "ih264e_cabac_structs.h"
82#include "ih264e_structs.h"
83#include "ih264e_globals.h"
84#include "ih264e_core_coding.h"
85#include "ih264e_mc.h"
86
87
88/*****************************************************************************/
89/* Function Definitions                                                      */
90/*****************************************************************************/
91
92/**
93*******************************************************************************
94*
95* @brief
96*  This function performs does the DCT transform then Hadamard transform
97*  and quantization for a macroblock when the mb mode is intra 16x16 mode
98*
99* @par Description:
100*  First  cf4 is done on all 16 4x4 blocks of the 16x16 input block.
101*  Then hadamard transform is done on the DC coefficients
102*  Quantization is then performed on the 16x16 block, 4x4 wise
103*
104* @param[in] pu1_src
105*  Pointer to source sub-block
106*
107* @param[in] pu1_pred
108*  Pointer to prediction sub-block
109*
110* @param[in] pi2_out
111*  Pointer to residual sub-block
112*  The output will be in linear format
113*  The first 16 continuous locations will contain the values of Dc block
114*  After DC block and a stride 1st AC block will follow
115*  After one more stride next AC block will follow
116*  The blocks will be in raster scan order
117*
118* @param[in] src_strd
119*  Source stride
120*
121* @param[in] pred_strd
122*  Prediction stride
123*
124* @param[in] dst_strd
125*  Destination stride
126*
127* @param[in] pu2_scale_matrix
128*  The quantization matrix for 4x4 transform
129*
130* @param[in] pu2_threshold_matrix
131*  Threshold matrix
132*
133* @param[in] u4_qbits
134*  15+QP/6
135*
136* @param[in] u4_round_factor
137*  Round factor for quant
138*
139* @param[out] pu1_nnz
140*  Memory to store the non-zeros after transform
141*  The first byte will be the nnz of DC block
142*  From the next byte the AC nnzs will be stored in raster scan order
143*
144* @param u4_dc_flag
145*  Signals if Dc transform is to be done or not
146*   1 -> Dc transform will be done
147*   0 -> Dc transform will not be done
148*
149* @remarks
150*
151*******************************************************************************
152*/
153void ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t *ps_codec,
154                                                UWORD8 *pu1_src,
155                                                UWORD8 *pu1_pred,
156                                                WORD16 *pi2_out,
157                                                WORD32 src_strd,
158                                                WORD32 pred_strd,
159                                                WORD32 dst_strd,
160                                                const UWORD16 *pu2_scale_matrix,
161                                                const UWORD16 *pu2_threshold_matrix,
162                                                UWORD32 u4_qbits,
163                                                UWORD32 u4_round_factor,
164                                                UWORD8 *pu1_nnz,
165                                                UWORD32 u4_dc_flag)
166
167{
168    WORD32 blk_cntr;
169    WORD32 i4_offsetx, i4_offsety;
170    UWORD8 *pu1_curr_src, *pu1_curr_pred;
171
172    WORD16 *pi2_dc_str = pi2_out;
173
174    /* Move to the ac addresses */
175    pu1_nnz++;
176    pi2_out += dst_strd;
177
178    for (blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++)
179    {
180        IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety);
181
182        pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
183        pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
184
185        ps_codec->pf_resi_trans_quant_4x4(pu1_curr_src, pu1_curr_pred,
186                                          pi2_out + blk_cntr * dst_strd,
187                                          src_strd, pred_strd, pu2_scale_matrix,
188                                          pu2_threshold_matrix, u4_qbits,
189                                          u4_round_factor, &pu1_nnz[blk_cntr],
190                                          &pi2_dc_str[blk_cntr]);
191
192    }
193
194    if (!u4_dc_flag)
195        return;
196
197    /*
198     * In case of i16x16, we need to remove the contribution of dc coeffs into
199     * nnz of each block. We are doing that in the packing function
200     */
201
202    /* Adjust pointers to point to dc values */
203    pi2_out -= dst_strd;
204    pu1_nnz--;
205
206    u4_qbits++;
207    u4_round_factor <<= 1;
208
209    ps_codec->pf_hadamard_quant_4x4(pi2_dc_str, pi2_out, pu2_scale_matrix,
210                                    pu2_threshold_matrix, u4_qbits,
211                                    u4_round_factor, &pu1_nnz[0]);
212}
213
214/**
215*******************************************************************************
216*
217* @brief
218*  This function performs the intra 16x16 inverse transform process for H264
219*  it includes inverse Dc transform, inverse quant and then inverse transform
220*
221* @par Description:
222*
223* @param[in] pi2_src
224*  Input data, 16x16 size
225*  First 16 mem locations will have the Dc coffs in rater scan order in linear fashion
226*  after a stride 1st AC clock will be present again in raster can order
227*  Then each AC block of the 16x16 block will follow in raster scan order
228*
229* @param[in] pu1_pred
230*  The predicted data, 16x16 size
231*  Block by block form
232*
233* @param[in] pu1_out
234*  Output 16x16
235*  In block by block form
236*
237* @param[in] src_strd
238*  Source stride
239*
240* @param[in] pred_strd
241*  input stride for prediction buffer
242*
243* @param[in] out_strd
244*  input stride for output buffer
245*
246* @param[in] pu2_iscale_mat
247*  Inverse quantization matrix for 4x4 transform
248*
249* @param[in] pu2_weigh_mat
250*  weight matrix of 4x4 transform
251*
252* @param[in] qp_div
253*  QP/6
254*
255* @param[in] pi4_tmp
256*  Input temporary buffer
257*  needs to be at least 20 in size
258*
259* @param[in] pu4_cntrl
260*  Controls the transform path
261*  total Last 17 bits are used
262*  the 16th th bit will correspond to DC block
263*  and 32-17 will correspond to the ac blocks in raster scan order
264*  bit equaling zero indicates that the entire 4x4 block is zero for DC
265*  For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block is nonzero
266*
267* @param[in] pi4_tmp
268*  Input temporary buffer
269*  needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size
270*
271* @returns
272*  none
273*
274* @remarks
275*  The all zero case must be taken care outside
276*
277*******************************************************************************
278*/
279void ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t *ps_codec,
280                                                    WORD16 *pi2_src,
281                                                    UWORD8 *pu1_pred,
282                                                    UWORD8 *pu1_out,
283                                                    WORD32 src_strd,
284                                                    WORD32 pred_strd,
285                                                    WORD32 out_strd,
286                                                    const UWORD16 *pu2_iscale_mat,
287                                                    const UWORD16 *pu2_weigh_mat,
288                                                    UWORD32 qp_div,
289                                                    UWORD32 u4_cntrl,
290                                                    UWORD32 u4_dc_trans_flag,
291                                                    WORD32 *pi4_tmp)
292{
293    /* Start index for inverse quant in a 4x4 block */
294    WORD32 iq_start_idx = (u4_dc_trans_flag == 0) ? 0 : 1;
295
296    /* Cntrl bits for 4x4 transforms
297     * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
298     * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
299     *                    : dc block must contain only single dc coefficient
300     * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
301     *                    : ie not (ac or dc)
302     */
303    UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
304
305    /* tmp registers for block ids */
306    UWORD32 u4_blk_id;
307
308    /* Subscrripts */
309    WORD32 i4_offset_x, i4_offset_y;
310
311    UWORD8 *pu1_cur_prd_blk, *pu1_cur_out_blk;
312
313    /* Src and stride for dc coeffs */
314    UWORD32 u4_dc_inc;
315    WORD16 *pi2_dc_src;
316
317    /*
318     * For intra blocks we need to do inverse dc transform
319     * In case if intra blocks, its here that we populate the dc bits in cntrl
320     * as they cannot be populated any earlier
321     */
322    if (u4_dc_trans_flag)
323    {
324        UWORD32 cntr, u4_dc_cntrl;
325        /* Do inv hadamard and place the results at the start of each AC block */
326        ps_codec->pf_ihadamard_scaling_4x4(pi2_src, pi2_src, pu2_iscale_mat,
327                                           pu2_weigh_mat, qp_div, pi4_tmp);
328
329        /* Update the cntrl flag */
330        u4_dc_cntrl = 0;
331        for (cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++)
332        {
333            u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
334        }
335        /* Mark dc bits as 1 if corresponding ac bit is 0 */
336        u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
337        /* Combine both ac and dc bits */
338        u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA)
339                        | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA);
340    }
341
342    /* Source for dc coeffs
343     * If the block is intra, we have to read dc values from first row of src
344     * then stride for each block is 1, other wise its src stride
345     */
346    pi2_dc_src = (iq_start_idx == 0) ? (pi2_src + src_strd) : pi2_src;
347    u4_dc_inc = (iq_start_idx == 0) ? src_strd : 1;
348
349    /* The AC blocks starts from 2nd row */
350    pi2_src += src_strd;
351
352    /* Get the block bits */
353    u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA);
354    u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16;
355    u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFFFF0000;
356
357    /* Get first block to process */
358    DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
359    while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
360    {
361        /* Compute address of src blocks */
362        WORD32 i4_src_offset = u4_dc_inc * u4_blk_id;
363
364        IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
365
366        /* Compute address of out and pred blocks */
367        pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
368        pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
369
370        /* Do inv dc transform */
371        ps_codec->pf_iquant_itrans_recon_4x4_dc(pi2_dc_src + i4_src_offset,
372                                                pu1_cur_prd_blk,
373                                                pu1_cur_out_blk, pred_strd,
374                                                out_strd, pu2_iscale_mat,
375                                                pu2_weigh_mat, qp_div, NULL,
376                                                iq_start_idx,
377                                                pi2_dc_src + i4_src_offset);
378        /* Get next DC block to process */
379        DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
380    }
381
382    /* now process ac/mixed blocks */
383    DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
384    while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
385    {
386
387        WORD32 i4_src_offset = src_strd * u4_blk_id;
388
389        IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
390
391        pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
392        pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
393
394        ps_codec->pf_iquant_itrans_recon_4x4(pi2_src + i4_src_offset,
395                                             pu1_cur_prd_blk, pu1_cur_out_blk,
396                                             pred_strd, out_strd,
397                                             pu2_iscale_mat, pu2_weigh_mat,
398                                             qp_div, (WORD16*) pi4_tmp,
399                                             iq_start_idx,
400                                             pi2_dc_src + u4_blk_id);
401
402        DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
403    }
404
405    /* Now process empty blocks */
406    DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
407    while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
408    {
409        IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
410
411        pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
412        pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
413
414        ps_codec->pf_inter_pred_luma_copy(pu1_cur_prd_blk, pu1_cur_out_blk,
415                                          pred_strd, out_strd, SIZE_4X4_BLK_HRZ,
416                                          SIZE_4X4_BLK_VERT, 0, 0);
417
418        DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
419    }
420}
421
422/**
423*******************************************************************************
424*
425* @brief
426*  This function performs does the DCT transform then Hadamard transform
427*  and quantization for a chroma macroblock
428*
429* @par Description:
430*  First  cf4 is done on all 16 4x4 blocks of the 8x8input block
431*  Then hadamard transform is done on the DC coefficients
432*  Quantization is then performed on the 8x8 block, 4x4 wise
433*
434* @param[in] pu1_src
435*  Pointer to source sub-block
436*  The input is in interleaved format for two chroma planes
437*
438* @param[in] pu1_pred
439*  Pointer to prediction sub-block
440*  Prediction is in inter leaved format
441*
442* @param[in] pi2_out
443*  Pointer to residual sub-block
444*  The output will be in linear format
445*  The first 4 continuous locations will contain the values of DC block for U
446*  and then next 4 will contain for V.
447*  After DC block and a stride 1st AC block of U plane will follow
448*  After one more stride next AC block of V plane will follow
449*  The blocks will be in raster scan order
450*
451*  After all the AC blocks of U plane AC blocks of V plane will follow in exact
452*  same way
453*
454* @param[in] src_strd
455*  Source stride
456*
457* @param[in] pred_strd
458*  Prediction stride
459*
460* @param[in] dst_strd
461*  Destination stride
462*
463* @param[in] pu2_scale_matrix
464*  The quantization matrix for 4x4 transform
465*
466* @param[in] pu2_threshold_matrix
467*  Threshold matrix
468*
469* @param[in] u4_qbits
470*  15+QP/6
471*
472* @param[in] u4_round_factor
473*  Round factor for quant
474*
475* @param[out] pu1_nnz
476*  Memory to store the non-zeros after transform
477*  The first byte will be the nnz od DC block for U plane
478*  From the next byte the AC nnzs will be storerd in raster scan order
479*  The fifth byte will be nnz of Dc block of V plane
480*  Then Ac blocks will follow
481*
482* @param u4_dc_flag
483*  Signals if Dc transform is to be done or not
484*   1 -> Dc transform will be done
485*   0 -> Dc transform will not be done
486*
487* @remarks
488*
489*******************************************************************************
490*/
491void ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t *ps_codec,
492                                                UWORD8 *pu1_src,
493                                                UWORD8 *pu1_pred,
494                                                WORD16 *pi2_out,
495                                                WORD32 src_strd,
496                                                WORD32 pred_strd,
497                                                WORD32 out_strd,
498                                                const UWORD16 *pu2_scale_matrix,
499                                                const UWORD16 *pu2_threshold_matrix,
500                                                UWORD32 u4_qbits,
501                                                UWORD32 u4_round_factor,
502                                                UWORD8 *pu1_nnz_c)
503{
504    WORD32 blk_cntr;
505    WORD32 i4_offsetx, i4_offsety;
506    UWORD8 *pu1_curr_src, *pu1_curr_pred;
507
508    WORD16 pi2_dc_str[8];
509    UWORD8 au1_dcnnz[2];
510
511    /* Move to the ac addresses */
512    pu1_nnz_c++;
513    pi2_out += out_strd;
514
515    for (blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++)
516    {
517        IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety);
518
519        pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
520        pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
521
522        /* For chroma, v plane nnz is populated from position 5 */
523        ps_codec->pf_resi_trans_quant_chroma_4x4(
524                        pu1_curr_src, pu1_curr_pred,
525                        pi2_out + blk_cntr * out_strd, src_strd, pred_strd,
526                        pu2_scale_matrix, pu2_threshold_matrix, u4_qbits,
527                        u4_round_factor, &pu1_nnz_c[blk_cntr + (blk_cntr > 3)],
528                        &pi2_dc_str[blk_cntr]);
529    }
530
531    /* Adjust pointers to point to dc values */
532    pi2_out -= out_strd;
533    pu1_nnz_c--;
534
535    u4_qbits++;
536    u4_round_factor <<= 1;
537
538    ps_codec->pf_hadamard_quant_2x2_uv(pi2_dc_str, pi2_out, pu2_scale_matrix,
539                                       pu2_threshold_matrix, u4_qbits,
540                                       u4_round_factor, au1_dcnnz);
541
542    /* Copy the dc nnzs */
543    pu1_nnz_c[0] = au1_dcnnz[0];
544    pu1_nnz_c[5] = au1_dcnnz[1];
545
546}
547
548/**
549*******************************************************************************
550* @brief
551*  This function performs the inverse transform with process for chroma MB of H264
552*
553* @par Description:
554*  Does inverse DC transform ,inverse quantization inverse transform
555*
556* @param[in] pi2_src
557*  Input data, 16x16 size
558*  The input is in the form of, first 4 locations will contain DC coeffs of
559*  U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
560*  in raster scan order will follow, each block as linear array in raster scan order.
561*  After a stride next AC block will follow. After all AC blocks of U plane
562*  V plane AC blocks will follow in exact same order.
563*
564* @param[in] pu1_pred
565*  The predicted data, 8x16 size, U and V interleaved
566*
567* @param[in] pu1_out
568*  Output 8x16, U and V interleaved
569*
570* @param[in] src_strd
571*  Source stride
572*
573* @param[in] pred_strd
574*  input stride for prediction buffer
575*
576* @param[in] out_strd
577*  input stride for output buffer
578*
579* @param[in] pu2_iscale_mat
580*  Inverse quantization martix for 4x4 transform
581*
582* @param[in] pu2_weigh_mat
583*  weight matrix of 4x4 transform
584*
585* @param[in] qp_div
586*  QP/6
587*
588* @param[in] pi4_tmp
589*  Input temporary buffer
590*  needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc cofss for chroma * number of planes
591*  in size
592*
593* @param[in] pu4_cntrl
594*  Controls the transform path
595*  the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block
596*  32-28 bits will indicate AC blocks of U plane in raster scan order
597*  27-23 bits will indicate AC blocks of V plane in rater scan order
598*  The bit 1 implies that there is at least one non zero coeff in a block
599*
600* @returns
601*  none
602*
603* @remarks
604*******************************************************************************
605*/
606void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t *ps_codec,
607                                                    WORD16 *pi2_src,
608                                                    UWORD8 *pu1_pred,
609                                                    UWORD8 *pu1_out,
610                                                    WORD32 src_strd,
611                                                    WORD32 pred_strd,
612                                                    WORD32 out_strd,
613                                                    const UWORD16 *pu2_iscale_mat,
614                                                    const UWORD16 *pu2_weigh_mat,
615                                                    UWORD32 qp_div,
616                                                    UWORD32 u4_cntrl,
617                                                    WORD32 *pi4_tmp)
618{
619    /* Cntrl bits for 4x4 transforms
620     * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
621     * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
622     *                    : dc block must contain only single dc coefficient
623     * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
624     *                    : ie not (ac or dc)
625     */
626
627    UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
628
629    /* tmp registers for block ids */
630    WORD32 u4_blk_id;
631
632    /* Offsets for pointers */
633    WORD32 i4_offset_x, i4_offset_y;
634
635    /* Pointer to 4x4 blocks */
636    UWORD8 *pu1_cur_4x4_prd_blk, *pu1_cur_4x4_out_blk;
637
638    /* Tmp register for pointer to dc coffs */
639    WORD16 *pi2_dc_src;
640
641    WORD16 i2_zero = 0;
642
643    /* Increment for dc block */
644    WORD32 i4_dc_inc;
645
646    /*
647     * Lets do the inverse transform for dc coeffs in chroma
648     */
649    if (u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA)
650    {
651        UWORD32 cntr, u4_dc_cntrl;
652        /* Do inv hadamard for u an v block */
653
654        ps_codec->pf_ihadamard_scaling_2x2_uv(pi2_src, pi2_src, pu2_iscale_mat,
655                                              pu2_weigh_mat, qp_div, NULL);
656        /*
657         * Update the cntrl flag
658         * Flag is updated as follows bits 15-11 -> u block dc bits
659         */
660        u4_dc_cntrl = 0;
661        for (cntr = 0; cntr < 8; cntr++)
662        {
663            u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
664        }
665
666        /* Mark dc bits as 1 if corresponding ac bit is 0 */
667        u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
668        /* Combine both ac and dc bits */
669        u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA)
670                        | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA);
671
672        /* Since we populated the dc coffs, we have to read them from there */
673        pi2_dc_src = pi2_src;
674        i4_dc_inc = 1;
675    }
676    else
677    {
678        u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA;
679        pi2_dc_src = &i2_zero;
680        i4_dc_inc = 0;
681    }
682
683    /* Get the block bits */
684    u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA);
685    u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16;
686    u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFF000000;
687
688    /* The AC blocks starts from 2nd row */
689    pi2_src += src_strd;
690
691    DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
692    while (u4_blk_id < 8)
693    {
694        WORD32 dc_src_offset = u4_blk_id * i4_dc_inc;
695
696        IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
697
698        pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
699        pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
700
701        ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc(
702                        pi2_dc_src + dc_src_offset, pu1_cur_4x4_prd_blk,
703                        pu1_cur_4x4_out_blk, pred_strd, out_strd, NULL, NULL, 0,
704                        NULL, pi2_dc_src + dc_src_offset);
705        /* Get next DC block to process */
706        DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
707    }
708
709    /* now process ac/mixed blocks */
710    DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
711    while (u4_blk_id < 8)
712    {
713        WORD32 i4_src_offset = src_strd * u4_blk_id;
714        WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
715
716        IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
717
718        pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
719        pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
720
721        ps_codec->pf_iquant_itrans_recon_chroma_4x4(pi2_src + i4_src_offset,
722                                                    pu1_cur_4x4_prd_blk,
723                                                    pu1_cur_4x4_out_blk,
724                                                    pred_strd, out_strd,
725                                                    pu2_iscale_mat,
726                                                    pu2_weigh_mat, qp_div,
727                                                    (WORD16 *) pi4_tmp,
728                                                    pi2_dc_src + dc_src_offset);
729
730        DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
731    }
732
733    /* Now process empty blocks */
734    DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
735    while (u4_blk_id < 8)
736    {
737        IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
738
739        pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
740        pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
741
742        ps_codec->pf_interleave_copy(pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk,
743                                     pred_strd, out_strd, SIZE_4X4_BLK_VERT,
744                                     SIZE_4X4_BLK_HRZ);
745
746        DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
747    }
748}
749
750/**
751******************************************************************************
752*
753* @brief  This function packs residue of an i16x16 luma mb for entropy coding
754*
755* @par   Description
756*  An i16 macro block contains two classes of units, dc 4x4 block and
757*  4x4 ac blocks. while packing the mb, the dc block is sent first, and
758*  the 16 ac blocks are sent next in scan order. Each and every block is
759*  represented by 3 parameters (nnz, significant coefficient map and the
760*  residue coefficients itself). If a 4x4 unit does not have any coefficients
761*  then only nnz is sent. Inside a 4x4 block the individual coefficients are
762*  sent in scan order.
763*
764*  The first byte of each block will be nnz of the block, if it is non zero,
765*  a 2 byte significance map is sent. This is followed by nonzero coefficients.
766*  This is repeated for 1 dc + 16 ac blocks.
767*
768* @param[in]  pi2_res_mb
769*  pointer to residue mb
770*
771* @param[in, out]  pv_mb_coeff_data
772*  buffer pointing to packed residue coefficients
773*
774* @param[in]  u4_res_strd
775*  residual block stride
776*
777* @param[out]  u1_cbp_l
778*  coded block pattern luma
779*
780* @param[in]   pu1_nnz
781*  number of non zero coefficients in each 4x4 unit
782*
* @param[out] pu4_cntrl
*  Control signal for inverse transform of 16x16 blocks
785*
786* @return none
787*
788* @ remarks
789*
790******************************************************************************
791*/
792void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb,
793                          void **pv_mb_coeff_data,
794                          WORD32 i4_res_strd,
795                          UWORD8 *u1_cbp_l,
796                          UWORD8 *pu1_nnz,
797                          UWORD32 *pu4_cntrl)
798{
799    /* pointer to packed sub block buffer space */
800    tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data), *ps_mb_coeff_data_ac;
801
802    /* no of non zero coefficients in the current sub block */
803    UWORD32 u4_nnz_cnt;
804
805    /* significant coefficient map */
806    UWORD32 u4_s_map;
807
808    /* pointer to scanning matrix */
809    const UWORD8 *pu1_scan_order;
810
811    /* number of non zeros in sub block */
812    UWORD32 u4_nnz;
813
814    /* coeff scan order */
815    const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
816
817    /* temp var */
818    UWORD32 coeff_cnt, mask, b4,u4_cntrl=0;
819
820    /*DC and AC coeff pointers*/
821    WORD16 *pi2_res_mb_ac,*pi2_res_mb_dc;
822
823    /********************************************************/
824    /*  pack dc coeff data for entropy coding               */
825    /********************************************************/
826
827    pi2_res_mb_dc = pi2_res_mb;
828    pu1_scan_order = gu1_luma_scan_order_dc;
829
830    u4_nnz = *pu1_nnz;
831    u4_cntrl = 0;
832
833    /* write number of non zero coefficients */
834    ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
835
836    if (u4_nnz)
837    {
838        for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
839        {
840            if (pi2_res_mb_dc[pu1_scan_order[coeff_cnt]])
841            {
842                /* write residue */
843                ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_dc[pu1_scan_order[coeff_cnt]];
844                u4_s_map |= mask;
845            }
846            mask <<= 1;
847        }
848        /* write significant coeff map */
849        ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
850        (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
851
852        u4_cntrl = 0x00008000;// Set DC bit in ctrl code
853    }
854    else
855    {
856        (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
857    }
858
859    /********************************************************/
860    /*  pack ac coeff data for entropy coding               */
861    /********************************************************/
862
863    pu1_nnz ++;
864    pu1_scan_order = gu1_luma_scan_order;
865    pi2_res_mb += i4_res_strd; /*Move to AC block*/
866
867    ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
868
869    for (b4 = 0; b4 < 16; b4++)
870    {
871        ps_mb_coeff_data = (*pv_mb_coeff_data);
872
873        u4_nnz = pu1_nnz[u1_scan_order[b4]];
874
875        /* Jump according to the scan order */
876        pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
877
878        /*
879         * Since this is a i16x16 block, we should not count dc coeff on indi
880         * vidual 4x4 blocks to nnz. But due to the implementation of 16x16
881         * trans function, we add dc's nnz to u4_nnz too. Hence we adjust that
882         * here
883         */
884        u4_nnz -= (pi2_res_mb_ac[0] != 0);
885
886        /* write number of non zero coefficients */
887        ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
888
889        if (u4_nnz)
890        {
891            for (u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
892            {
893                if (pi2_res_mb_ac[pu1_scan_order[coeff_cnt]])
894                {
895                    /* write residue */
896                    ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_ac[pu1_scan_order[coeff_cnt]];
897                    u4_s_map |= mask;
898                }
899                mask <<= 1;
900            }
901            /* write significant coeff map */
902            ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
903            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
904            *u1_cbp_l = 15;
905
906            u4_cntrl |= (1 << (31 - u1_scan_order[b4]));
907        }
908        else
909        {
910            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
911        }
912
913    }
914
915    if (!(*u1_cbp_l))
916    {
917        (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
918    }
919
920    /* Store the cntrl signal */
921    (*pu4_cntrl) = u4_cntrl;
922    return;
923}
924
925/**
926******************************************************************************
927*
928* @brief  This function packs residue of an p16x16 luma mb for entropy coding
929*
930* @par   Description
931*  A p16x16 macro block contains two classes of units 16  4x4 ac blocks.
932*  while packing the mb, the dc block is sent first, and
933*  the 16 ac blocks are sent next in scan order. Each and every block is
934*  represented by 3 parameters (nnz, significant coefficient map and the
935*  residue coefficients itself). If a 4x4 unit does not have any coefficients
936*  then only nnz is sent. Inside a 4x4 block the individual coefficients are
937*  sent in scan order.
938*
939*  The first byte of each block will be nnz of the block, if it is non zero,
940*  a 2 byte significance map is sent. This is followed by nonzero coefficients.
941*  This is repeated for 1 dc + 16 ac blocks.
942*
943* @param[in]  pi2_res_mb
944*  pointer to residue mb
945*
946* @param[in, out]  pv_mb_coeff_data
947*  buffer pointing to packed residue coefficients
948*
949* @param[in]  i4_res_strd
950*  residual block stride
951*
952* @param[out]  u1_cbp_l
953*  coded block pattern luma
954*
955* @param[in]   pu1_nnz
956*  number of non zero coefficients in each 4x4 unit
957*
958* @param[out] pu4_cntrl
959*  Control signal for inverse transform
960*
961* @return none
962*
963* @remarks Killing coffs not yet coded
964*
965******************************************************************************
966*/
967void ih264e_pack_l_mb(WORD16 *pi2_res_mb,
968                      void **pv_mb_coeff_data,
969                      WORD32 i4_res_strd,
970                      UWORD8 *u1_cbp_l,
971                      UWORD8 *pu1_nnz,
972                      UWORD32 u4_thres_resi,
973                      UWORD32 *pu4_cntrl)
974{
975    /* pointer to packed sub block buffer space */
976    tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb;
977
978    /* no of non zero coefficients in the current sub block */
979    UWORD32 u4_nnz_cnt;
980
981    /* significant coefficient map */
982    UWORD32 u4_s_map;
983
984    /* pointer to scanning matrix */
985    const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
986
987    /* number of non zeros in sub block */
988    UWORD32 u4_nnz;
989
990    /* pointer to residual sub block */
991    WORD16  *pi2_res_sb;
992
993    /* coeff scan order */
994    const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
995
996    /* coeff cost */
997    const UWORD8  *pu1_coeff_cost = gu1_coeff_cost;
998
999    /* temp var */
1000    UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8;
1001
1002    /* temp var */
1003    WORD32 i4_res_val, i4_run = -1, dcac_block;
1004
1005    /* When Hadamard transform is disabled, first row values are dont care, ignore them */
1006    pi2_res_mb += i4_res_strd;
1007
1008    /* When Hadamard transform is disabled, first unit value is dont care, ignore this */
1009    pu1_nnz ++;
1010
1011    ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
1012
1013    /********************************************************/
1014    /*  pack coeff data for entropy coding                  */
1015    /********************************************************/
1016
1017    for (b4 = 0; b4 < 16; b4++)
1018    {
1019        ps_mb_coeff_data = (*pv_mb_coeff_data);
1020
1021        b8 = b4 >> 2;
1022
1023        u4_nnz = pu1_nnz[u1_scan_order[b4]];
1024
1025        /* Jump according to the scan order */
1026        pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
1027
1028        /* write number of non zero coefficients */
1029        ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1030
1031        if (u4_nnz)
1032        {
1033            for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
1034            {
1035                /* number of runs of zero before, this is used to compute coeff cost */
1036                i4_run++;
1037
1038                i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1039
1040                if (i4_res_val)
1041                {
1042                    /* write residue */
1043                    ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val;
1044                    u4_s_map |= mask;
1045
1046                    if (u4_thres_resi)
1047                    {
1048                        /* compute coeff cost */
1049                        if (i4_res_val == 1 || i4_res_val == -1)
1050                        {
1051                            if (i4_run < 6)
1052                                u4_b8_coeff_cost += pu1_coeff_cost[i4_run];
1053                        }
1054                        else
1055                            u4_b8_coeff_cost += 9;
1056
1057                        i4_run = -1;
1058                    }
1059                }
1060
1061                mask <<= 1;
1062            }
1063
1064            /* write significant coeff map */
1065            ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1066            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1067
1068            /* cbp */
1069            *u1_cbp_l |= (1 << b8);
1070
1071            /* Cntrl map for inverse transform computation
1072             *
1073             * If coeff_cnt is zero, it means that only nonzero was a dc coeff
1074             * Hence we have to set the 16 - u1_scan_order[b4]) position instead
1075             * of 31 - u1_scan_order[b4]
1076             */
1077            dcac_block = (coeff_cnt == 0)?16:31;
1078            u4_cntrl |= (1 << (dcac_block - u1_scan_order[b4]));
1079        }
1080        else
1081        {
1082            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1083        }
1084
1085        /* Decide if the 8x8 unit has to be sent for entropy coding? */
1086        if ((b4+1) % 4 == 0)
1087        {
1088            if ( u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) &&
1089                            (*u1_cbp_l & (1 << b8)) )
1090            {
1091
1092
1093                /*
1094                 * When we want to reset the full 8x8 block, we have to reset
1095                 * both the dc and ac coeff bits hence we have the symmetric
1096                 * arrangement of bits
1097                 */
1098                const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033};
1099
1100                /* restore cbp */
1101                *u1_cbp_l = (*u1_cbp_l & (~(1 << b8)));
1102
1103                /* correct cntrl flag */
1104                u4_cntrl = u4_cntrl & (~cntrl_mask_map[(b4 >> 2)]);
1105
1106                /* correct nnz */
1107                pu1_nnz[u1_scan_order[b4 - 3]] = 0;
1108                pu1_nnz[u1_scan_order[b4 - 2]] = 0;
1109                pu1_nnz[u1_scan_order[b4 - 1]] = 0;
1110                pu1_nnz[u1_scan_order[b4]] = 0;
1111
1112                /* reset blk cost */
1113                u4_b8_coeff_cost = 0;
1114            }
1115
1116            if (!(*u1_cbp_l & (1 << b8)))
1117            {
1118                (*pv_mb_coeff_data) = ps_mb_coeff_data_b8;
1119            }
1120
1121            u4_mb_coeff_cost += u4_b8_coeff_cost;
1122
1123            u4_b8_coeff_cost = 0;
1124            i4_run = -1;
1125            ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
1126        }
1127    }
1128
1129    if (u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD)
1130                    && (*u1_cbp_l))
1131    {
1132        (*pv_mb_coeff_data) = ps_mb_coeff_data_mb;
1133        *u1_cbp_l = 0;
1134        u4_cntrl = 0;
1135        memset(pu1_nnz, 0, 16);
1136    }
1137
1138    (*pu4_cntrl) = u4_cntrl;
1139
1140    return;
1141}
1142
1143/**
1144******************************************************************************
1145*
1146* @brief  This function packs residue of an i8x8 chroma mb for entropy coding
1147*
1148* @par   Description
1149*  An i8 chroma macro block contains two classes of units, dc 2x2 block and
1150*  4x4 ac blocks. while packing the mb, the dc block is sent first, and
1151*  the 4 ac blocks are sent next in scan order. Each and every block is
1152*  represented by 3 parameters (nnz, significant coefficient map and the
1153*  residue coefficients itself). If a 4x4 unit does not have any coefficients
1154*  then only nnz is sent. Inside a 4x4 block the individual coefficients are
1155*  sent in scan order.
1156*
1157*  The first byte of each block will be nnz of the block, if it is non zero,
1158*  a 2 byte significance map is sent. This is followed by nonzero coefficients.
1159*  This is repeated for 1 dc + 4 ac blocks.
1160*
1161* @param[in]  pi2_res_mb
1162*  pointer to residue mb
1163*
1164* @param[in, out]  pv_mb_coeff_data
1165*  buffer pointing to packed residue coefficients
1166*
1167* @param[in]  u4_res_strd
1168*  residual block stride
1169*
1170* @param[out]  u1_cbp_c
1171*  coded block pattern chroma
1172*
1173* @param[in]   pu1_nnz
1174*  number of non zero coefficients in each 4x4 unit
1175*
1176* @param[out]   pu1_nnz
1177*  Control signal for inverse transform
1178*
1179* @param[in]   u4_swap_uv
1180*  Swaps the order of U and V planes in entropy bitstream
1181*
1182* @return none
1183*
1184* @ remarks
1185*
1186******************************************************************************
1187*/
1188void ih264e_pack_c_mb(WORD16 *pi2_res_mb,
1189                      void **pv_mb_coeff_data,
1190                      WORD32 i4_res_strd,
1191                      UWORD8 *u1_cbp_c,
1192                      UWORD8 *pu1_nnz,
1193                      UWORD32 u4_thres_resi,
1194                      UWORD32 *pu4_cntrl,
1195                      UWORD32 u4_swap_uv)
1196{
1197    /* pointer to packed sub block buffer space */
1198    tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data);
1199    tu_sblk_coeff_data_t *ps_mb_coeff_data_dc, *ps_mb_coeff_data_ac;
1200
1201    /* nnz pointer */
1202    UWORD8 *pu1_nnz_ac, *pu1_nnz_dc;
1203
1204    /* nnz counter */
1205    UWORD32 u4_nnz_cnt;
1206
1207    /* significant coefficient map */
1208    UWORD32 u4_s_map;
1209
1210    /* pointer to scanning matrix */
1211    const UWORD8 *pu1_scan_order;
1212
1213    /* no of non zero coefficients in the current sub block */
1214    UWORD32 u4_nnz;
1215
1216    /* pointer to residual sub block, res val */
1217    WORD16 *pi2_res_sb, i2_res_val;
1218
1219    /* temp var */
1220    UWORD32 coeff_cnt, mask, b4,plane;
1221
1222    /* temp var */
1223    UWORD32 u4_coeff_cost;
1224    WORD32 i4_run;
1225
1226    /* coeff cost */
1227    const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
1228
1229    /* pointer to packed buffer space */
1230    UWORD32 *pu4_mb_coeff_data = NULL;
1231
1232    /* ac coded block pattern */
1233    UWORD8 u1_cbp_ac;
1234
1235    /* Variable to store the current bit pos in cntrl variable*/
1236    UWORD32 cntrl_pos = 0;
1237
1238    /********************************************************/
1239    /*  pack dc coeff data for entropy coding               */
1240    /********************************************************/
1241    pu1_scan_order = gu1_chroma_scan_order_dc;
1242    pi2_res_sb = pi2_res_mb;
1243    pu1_nnz_dc = pu1_nnz;
1244    (*pu4_cntrl) = 0;
1245    cntrl_pos = 15;
1246    ps_mb_coeff_data_dc = (*pv_mb_coeff_data);
1247
1248    /* Color space conversion between SP_UV and SP_VU
1249     * We always assume SP_UV for all the processing
1250     * Hence to get proper stream output we need to swap U and V channels here
1251     *
1252     * For that there are two paths we need to look for
1253     * One is the path to bitstream , these variables should have the proper input
1254     * configured UV or VU
1255     * For the other path the inverse transform variables should have what ever ordering the
1256     * input had
1257     */
1258
1259    if (u4_swap_uv)
1260    {
1261        pu1_nnz_dc += 5;/* Move to NNZ of V planve */
1262        pi2_res_sb += 4;/* Move to DC coff of V plane */
1263
1264        cntrl_pos = 14; /* Control bit for V plane */
1265    }
1266
1267    for (plane = 0; plane < 2; plane++)
1268    {
1269        ps_mb_coeff_data = (*pv_mb_coeff_data);
1270
1271        u4_nnz = *pu1_nnz_dc;
1272        /* write number of non zero coefficients U/V */
1273        ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1274
1275        if (u4_nnz)
1276        {
1277            for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
1278            {
1279                i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1280                if (i2_res_val)
1281                {
1282                    /* write residue U/V */
1283                    ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
1284                    u4_s_map |= mask;
1285                }
1286                mask <<= 1;
1287            }
1288            /* write significant coeff map U/V */
1289            ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1290            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1291            *u1_cbp_c = 1;
1292
1293            (*pu4_cntrl) |= (1 << cntrl_pos);
1294        }
1295        else
1296        {
1297            (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1298        }
1299
1300        if (u4_swap_uv)
1301        {
1302            cntrl_pos++; /* Control bit for U plane */
1303            pu1_nnz_dc -= 5; /* Move to NNZ of U plane */
1304            pi2_res_sb -= 4; /* Move to DC coff of U plane */
1305
1306        }
1307        else
1308        {
1309            cntrl_pos--; /* Control bit for U plane */
1310            pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */
1311            pi2_res_sb += 4; /* Move to DC coff of V plane */
1312        }
1313    }
1314
1315    /********************************************************/
1316    /*  pack ac coeff data for entropy coding               */
1317    /********************************************************/
1318
1319    pu1_scan_order = gu1_chroma_scan_order;
1320    ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
1321
1322    if (u4_swap_uv)
1323    {
1324        pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane ,ie 1dc row+ 4 ac row */
1325        cntrl_pos = 27; /* The control bits are to be added for V bloc ie 31-4 th bit */
1326        pu1_nnz_ac = pu1_nnz + 6;/*Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
1327    }
1328    else
1329    {
1330        pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane ,ie 1dc row */
1331        cntrl_pos = 31;
1332        pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc */
1333    }
1334
1335    for (plane = 0; plane < 2; plane++)
1336    {
1337        pu4_mb_coeff_data = (*pv_mb_coeff_data);
1338
1339        u4_coeff_cost = 0;
1340        i4_run = -1;
1341
1342        /* get the current cbp, so that it automatically
1343         * gets reverted in case of zero ac values */
1344        u1_cbp_ac = *u1_cbp_c;
1345
1346        for (b4 = 0; b4 < 4; b4++)
1347        {
1348            ps_mb_coeff_data = (*pv_mb_coeff_data);
1349
1350            u4_nnz = *pu1_nnz_ac;
1351
1352            /*
1353             * We are scanning only ac coeffs, but the nnz is for the
1354             * complete 4x4 block. Hence we have to discount the nnz contributed
1355             * by the dc coefficient
1356             */
1357            u4_nnz -= (pi2_res_sb[0]!=0);
1358
1359            /* write number of non zero coefficients U/V */
1360            ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1361
1362            if (u4_nnz)
1363            {
1364                for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
1365                {
1366                    i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1367
1368                    i4_run++;
1369
1370                    if (i2_res_val)
1371                    {
1372                        /* write residue U/V */
1373                        ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
1374                        u4_s_map |= mask;
1375
1376                        if ( u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD) )
1377                        {
1378                            /* compute coeff cost */
1379                            if (i2_res_val == 1 || i2_res_val == -1)
1380                            {
1381                                if (i4_run < 6)
1382                                    u4_coeff_cost += pu1_coeff_cost[i4_run];
1383                            }
1384                            else
1385                                u4_coeff_cost += 9;
1386
1387                            i4_run = -1;
1388                        }
1389                    }
1390                    mask <<= 1;
1391                }
1392
1393                /* write significant coeff map U/V */
1394                ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1395                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1396                u1_cbp_ac = 2;
1397
1398                (*pu4_cntrl) |= 1 << cntrl_pos;
1399            }
1400            else
1401            {
1402                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1403            }
1404
1405            pu1_nnz_ac++;
1406            pi2_res_sb += i4_res_strd;
1407            cntrl_pos--;
1408        }
1409
1410        /* reset block */
1411        if (u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
1412        {
1413            pu4_mb_coeff_data[0] = 0;
1414            pu4_mb_coeff_data[1] = 0;
1415            pu4_mb_coeff_data[2] = 0;
1416            pu4_mb_coeff_data[3] = 0;
1417            (*pv_mb_coeff_data) = pu4_mb_coeff_data + 4;
1418
1419            /* Generate the control signal */
1420            /* Zero out the current plane's AC coefficients */
1421            (*pu4_cntrl) &= ((plane == u4_swap_uv) ? 0x0FFFFFFF : 0xF0FFFFFF);
1422
1423            /* Similarly do for the NNZ also */
1424            *(pu1_nnz_ac - 4) = 0;
1425            *(pu1_nnz_ac - 3) = 0;
1426            *(pu1_nnz_ac - 2) = 0;
1427            *(pu1_nnz_ac - 1) = 0;
1428        }
1429        else
1430        {
1431            *u1_cbp_c = u1_cbp_ac;
1432        }
1433
1434        if (u4_swap_uv)
1435        {
1436            pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to V plane ,ie 1dc row+ 4 ac row + 1 dc row */
1437            cntrl_pos = 31; /* The control bits are to be added for V bloc ie 31-4 th bit */
1438            pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
1439
1440            pu1_nnz_ac = pu1_nnz + 1;
1441        }
1442        else
1443            pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */
1444    }
1445
1446    /* restore the ptr basing on cbp */
1447    if (*u1_cbp_c == 0)
1448    {
1449        (*pv_mb_coeff_data) = ps_mb_coeff_data_dc;
1450    }
1451    else if (*u1_cbp_c == 1)
1452    {
1453        (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
1454    }
1455
1456    return ;
1457}
1458
1459/**
1460*******************************************************************************
1461*
1462* @brief performs luma core coding when intra mode is i16x16
1463*
1464* @par Description:
1465*  If the current mb is to be coded as intra of mb type i16x16, the mb is first
1466*  predicted using one of i16x16 prediction filters, basing on the intra mode
1467*  chosen. Then, error is computed between the input blk and the estimated blk.
1468*  This error is transformed (hierarchical transform i.e., dct followed by hada-
1469*  -mard), quantized. The quantized coefficients are packed in scan order for
1470*  entropy coding.
1471*
1472* @param[in] ps_proc_ctxt
1473*  pointer to the current macro block context
1474*
1475* @returns u1_cbp_l
1476*  coded block pattern luma
1477*
1478* @remarks none
1479*
1480*******************************************************************************
1481*/
1482
1483UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc)
1484{
1485    /* Codec Context */
1486    codec_t *ps_codec = ps_proc->ps_codec;
1487
1488    /* pointer to ref macro block */
1489    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
1490
1491    /* pointer to src macro block */
1492    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
1493
1494    /* pointer to prediction macro block */
1495    UWORD8 *pu1_pred_mb = NULL;
1496
1497    /* pointer to residual macro block */
1498    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
1499
1500    /* strides */
1501    WORD32 i4_src_strd = ps_proc->i4_src_strd;
1502    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
1503    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1504    WORD32 i4_res_strd = ps_proc->i4_res_strd;
1505
1506    /* intra mode */
1507    UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
1508
1509    /* coded block pattern */
1510    UWORD8 u1_cbp_l = 0;
1511
1512    /* number of non zero coeffs*/
1513    UWORD32 au4_nnz[5];
1514    UWORD8  *pu1_nnz = (UWORD8 *)au4_nnz;
1515
1516    /*Cntrol signal for itrans*/
1517    UWORD32 u4_cntrl;
1518
1519    /* quantization parameters */
1520    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1521
1522    /* pointer to packed mb coeff data */
1523    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1524
1525    /* init nnz */
1526    au4_nnz[0] = 0;
1527    au4_nnz[1] = 0;
1528    au4_nnz[2] = 0;
1529    au4_nnz[3] = 0;
1530    au4_nnz[4] = 0;
1531
1532    if (u1_intra_mode == PLANE_I16x16)
1533    {
1534        pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16_plane;
1535    }
1536    else
1537    {
1538        pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16;
1539    }
1540
1541    /********************************************************/
1542    /*  error estimation,                                   */
1543    /*  transform                                           */
1544    /*  quantization                                        */
1545    /********************************************************/
1546    ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
1547                                               pu1_pred_mb, pi2_res_mb,
1548                                               i4_src_strd, i4_pred_strd,
1549                                               i4_res_strd,
1550                                               ps_qp_params->pu2_scale_mat,
1551                                               ps_qp_params->pu2_thres_mat,
1552                                               ps_qp_params->u1_qbits,
1553                                               ps_qp_params->u4_dead_zone,
1554                                               pu1_nnz, ENABLE_DC_TRANSFORM);
1555
1556    /********************************************************/
1557    /*  pack coeff data for entropy coding                  */
1558    /********************************************************/
1559    ih264e_pack_l_mb_i16(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
1560                         pu1_nnz, &u4_cntrl);
1561
1562    /********************************************************/
1563    /*  ierror estimation,                                  */
1564    /*  itransform                                          */
1565    /*  iquantization                                       */
1566    /********************************************************/
1567    /*
1568     *if refernce frame is not to be computed
1569     *we only need the right and bottom border 4x4 blocks to predict next intra
1570     *blocks, hence only compute them
1571     */
1572    if (!ps_proc->u4_compute_recon)
1573    {
1574        u4_cntrl &= 0x111F8000;
1575    }
1576
1577    if (u4_cntrl)
1578    {
1579        ih264e_luma_16x16_idctrans_iquant_itrans_recon(
1580                        ps_codec, pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
1581                        i4_res_strd, i4_pred_strd, i4_rec_strd,
1582                        ps_qp_params->pu2_iscale_mat,
1583                        ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
1584                        u4_cntrl, ENABLE_DC_TRANSFORM,
1585                        ps_proc->pv_scratch_buff);
1586    }
1587    else
1588    {
1589        ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb, i4_pred_strd,
1590                                          i4_rec_strd, MB_SIZE, MB_SIZE, NULL,
1591                                          0);
1592    }
1593
1594    return (u1_cbp_l);
1595}
1596
1597
1598/**
1599*******************************************************************************
1600*
1601* @brief performs luma core coding when intra mode is i4x4
1602*
1603* @par Description:
1604*  If the current mb is to be coded as intra of mb type i4x4, the mb is first
1605*  predicted using one of i4x4 prediction filters, basing on the intra mode
1606*  chosen. Then, error is computed between the input blk and the estimated blk.
1607*  This error is dct transformed and quantized. The quantized coefficients are
1608*  packed in scan order for entropy coding.
1609*
* @param[in] ps_proc
1611*  pointer to the current macro block context
1612*
1613* @returns u1_cbp_l
1614*  coded block pattern luma
1615*
1616* @remarks
1617*  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
1618*  mentioned in h.264 specification
1619*
1620*******************************************************************************
1621*/
1622UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc)
1623{
1624    /* Codec Context */
1625    codec_t *ps_codec = ps_proc->ps_codec;
1626
1627    /* pointer to ref macro block */
1628    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
1629
1630    /* pointer to src macro block */
1631    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
1632
1633    /* pointer to prediction macro block */
1634    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1635
1636    /* pointer to residual macro block */
1637    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
1638
1639    /* strides */
1640    WORD32 i4_src_strd = ps_proc->i4_src_strd;
1641    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
1642    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1643
1644    /* pointer to neighbors: left, top, top-left */
1645    UWORD8 *pu1_mb_a;
1646    UWORD8 *pu1_mb_b;
1647    UWORD8 *pu1_mb_c;
1648    UWORD8 *pu1_mb_d;
1649
1650    /* intra mode */
1651    UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
1652
1653    /* neighbor availability */
1654    WORD32 i4_ngbr_avbl;
1655
1656    /* neighbor pels for intra prediction */
1657    UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1658
1659    /* coded block pattern */
1660    UWORD8 u1_cbp_l = 0;
1661
1662    /* number of non zero coeffs*/
1663    UWORD8  u1_nnz;
1664
1665    /* quantization parameters */
1666    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1667
1668    /* pointer to packed mb coeff data */
1669    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1670
1671    /* pointer to packed mb coeff data */
1672    tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
1673
1674    /* no of non zero coefficients in the current sub block */
1675    UWORD32 u4_nnz_cnt;
1676
1677    /* significant coefficient map */
1678    UWORD32 u4_s_map;
1679
1680    /* pointer to scanning matrix */
1681    const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1682
1683    /*Dummy variable for 4x4 trans fucntion*/
1684    WORD16 i2_dc_dummy;
1685
1686    /* temp var */
1687    UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask;
1688
1689    /* Process 16 4x4 lum sub-blocks of the MB in scan order */
1690    for (b8 = 0; b8 < 4; b8++)
1691    {
1692        u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3;
1693        u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3;
1694
1695        /* if in case cbp for the 8x8 block is zero, send no residue */
1696        ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
1697
1698        for (b4 = 0; b4 < 4; b4++)
1699        {
1700            /* index of pel in MB */
1701            u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2);
1702            u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2);
1703
1704            /* Initialize source and reference pointers */
1705            pu1_curr_mb = ps_proc->pu1_src_buf_luma + u1_pix_x + (u1_pix_y * i4_src_strd);
1706            pu1_ref_mb = ps_proc->pu1_rec_buf_luma + u1_pix_x + (u1_pix_y * i4_rec_strd);
1707
1708            /* pointer to left of ref macro block */
1709            pu1_mb_a = pu1_ref_mb - 1;
1710            /* pointer to top of ref macro block */
1711            pu1_mb_b = pu1_ref_mb - i4_rec_strd;
1712            /* pointer to topright of ref macro block */
1713            pu1_mb_c = pu1_mb_b + 4;
1714            /* pointer to topleft macro block */
1715            pu1_mb_d = pu1_mb_b - 1;
1716
1717            /* compute neighbor availability */
1718            i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1719
1720            /* sub block intra mode */
1721            u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4];
1722
1723            /********************************************************/
1724            /* gather prediction pels from neighbors for prediction */
1725            /********************************************************/
1726            /* left pels */
1727            if (i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK)
1728            {
1729                for (i = 0; i < 4; i++)
1730                    pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * i4_rec_strd];
1731            }
1732            else
1733            {
1734                memset(pu1_ngbr_pels_i4, 0, 4);
1735            }
1736
1737            /* top pels */
1738            if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
1739            {
1740                memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1741            }
1742            else
1743            {
1744                memset(pu1_ngbr_pels_i4 + 5, 0, 4);
1745            }
1746            /* top left pels */
1747            if (i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK)
1748            {
1749                pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1750            }
1751            else
1752            {
1753                pu1_ngbr_pels_i4[4] = 0;
1754            }
1755            /* top right pels */
1756            if (i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK)
1757            {
1758                memcpy(pu1_ngbr_pels_i4+8+1,pu1_mb_c,4);
1759            }
1760            else if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
1761            {
1762                memset(pu1_ngbr_pels_i4+8+1,pu1_ngbr_pels_i4[8],4);
1763            }
1764
1765            /********************************************************/
1766            /*  prediction                                          */
1767            /********************************************************/
1768            (ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4,
1769                                                          pu1_pred_mb, 0,
1770                                                          i4_pred_strd,
1771                                                          i4_ngbr_avbl);
1772
1773            /********************************************************/
1774            /*  error estimation,                                   */
1775            /*  transform                                           */
1776            /*  quantization                                        */
1777            /********************************************************/
1778            ps_codec->pf_resi_trans_quant_4x4(pu1_curr_mb, pu1_pred_mb,
1779                                              pi2_res_mb, i4_src_strd,
1780                                              i4_pred_strd,
1781                                              ps_qp_params->pu2_scale_mat,
1782                                              ps_qp_params->pu2_thres_mat,
1783                                              ps_qp_params->u1_qbits,
1784                                              ps_qp_params->u4_dead_zone,
1785                                              &u1_nnz, &i2_dc_dummy);
1786
1787            /********************************************************/
1788            /*  pack coeff data for entropy coding                  */
1789            /********************************************************/
1790            ps_mb_coeff_data = *pv_mb_coeff_data;
1791
1792            /* write number of non zero coefficients */
1793            ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz;
1794
1795            if (u1_nnz)
1796            {
1797                for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz; coeff_cnt++)
1798                {
1799                    if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
1800                    {
1801                        /* write residue */
1802                        ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
1803                        u4_s_map |= mask;
1804                    }
1805                    mask <<= 1;
1806                }
1807                /* write significant coeff map */
1808                ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1809
1810                /* update ptr to coeff data */
1811                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1812
1813                /* cbp */
1814                u1_cbp_l |= (1 << b8);
1815            }
1816            else
1817            {
1818                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1819            }
1820
1821            /********************************************************/
1822            /*  ierror estimation,                                  */
1823            /*  itransform                                          */
1824            /*  iquantization                                       */
1825            /********************************************************/
1826            if (u1_nnz)
1827                ps_codec->pf_iquant_itrans_recon_4x4(
1828                                pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
1829                                /*No input stride,*/i4_pred_strd,
1830                                i4_rec_strd, ps_qp_params->pu2_iscale_mat,
1831                                ps_qp_params->pu2_weigh_mat,
1832                                ps_qp_params->u1_qp_div,
1833                                ps_proc->pv_scratch_buff, 0, 0);
1834            else
1835                ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb,
1836                                                  i4_pred_strd, i4_rec_strd,
1837                                                  BLK_SIZE, BLK_SIZE, NULL,
1838                                                  0);
1839
1840        }
1841
1842        /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
1843        if (!(u1_cbp_l & (1 << b8)))
1844        {
1845            *pv_mb_coeff_data = ps_mb_coeff_data_b8;
1846        }
1847    }
1848
1849    return (u1_cbp_l);
1850}
1851
1852/**
1853*******************************************************************************
1854*
1855* @brief performs luma core coding when intra mode is i4x4
1856*
1857* @par Description:
1858*  If the current mb is to be coded as intra of mb type i4x4, the mb is first
1859*  predicted using one of i4x4 prediction filters, basing on the intra mode
1860*  chosen. Then, error is computed between the input blk and the estimated blk.
1861*  This error is dct transformed and quantized. The quantized coefficients are
1862*  packed in scan order for entropy coding.
1863*
1864* @param[in] ps_proc_ctxt
1865*  pointer to the current macro block context
1866*
1867* @returns u1_cbp_l
1868*  coded block pattern luma
1869*
1870* @remarks
1871*  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
1872*  mentioned in h.264 specification
1873*
1874*******************************************************************************
1875*/
1876UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc)
1877{
1878    /* Codec Context */
1879    codec_t *ps_codec = ps_proc->ps_codec;
1880
1881    /* pointer to ref macro block */
1882    UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4;
1883
1884    /* pointer to recon buffer */
1885    UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
1886
1887    /* pointer to residual macro block */
1888    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1889
1890    /* strides */
1891    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
1892
1893    /* number of non zero coeffs*/
1894    UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
1895
1896    /* coded block pattern */
1897    UWORD8 u1_cbp_l = 0;
1898
1899    /* pointer to packed mb coeff data */
1900    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1901
1902    /* pointer to packed mb coeff data */
1903    tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
1904
1905    /* no of non zero coefficients in the current sub block */
1906    UWORD32 u4_nnz_cnt;
1907
1908    /* significant coefficient map */
1909    UWORD32 u4_s_map;
1910
1911    /* pointer to scanning matrix */
1912    const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1913
1914    /* temp var */
1915    UWORD32 b8, b4, coeff_cnt, mask;
1916
1917    /* Process 16 4x4 lum sub-blocks of the MB in scan order */
1918    for (b8 = 0; b8 < 4; b8++)
1919    {
1920        /* if in case cbp for the 8x8 block is zero, send no residue */
1921        ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
1922
1923        for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1924        {
1925            /********************************************************/
1926            /*  pack coeff data for entropy coding                  */
1927            /********************************************************/
1928            ps_mb_coeff_data = *pv_mb_coeff_data;
1929
1930            /* write number of non zero coefficients */
1931            ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz;
1932
1933            if (*pu1_nnz)
1934            {
1935                for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz; coeff_cnt++)
1936                {
1937                    if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
1938                    {
1939                        /* write residue */
1940                        ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
1941                        u4_s_map |= mask;
1942                    }
1943                    mask <<= 1;
1944                }
1945                /* write significant coeff map */
1946                ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1947
1948                /* update ptr to coeff data */
1949                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1950
1951                /* cbp */
1952                u1_cbp_l |= (1 << b8);
1953            }
1954            else
1955            {
1956                (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1957            }
1958        }
1959
1960        /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
1961        if (!(u1_cbp_l & (1 << b8)))
1962        {
1963            *pv_mb_coeff_data = ps_mb_coeff_data_b8;
1964        }
1965    }
1966
1967    /* memcpy recon */
1968    ps_codec->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE, i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0);
1969
1970    return (u1_cbp_l);
1971}
1972
1973
1974/**
1975*******************************************************************************
1976*
1977* @brief performs chroma core coding for intra macro blocks
1978*
1979* @par Description:
1980*  If the current MB is to be intra coded with mb type chroma I8x8, the MB is
1981*  first predicted using intra 8x8 prediction filters. The predicted data is
1982*  compared with the input for error and the error is transformed. The DC
1983*  coefficients of each transformed sub blocks are further transformed using
1984*  Hadamard transform. The resulting coefficients are quantized, packed and sent
1985*  for entropy coding.
1986*
1987* @param[in] ps_proc_ctxt
1988*  pointer to the current macro block context
1989*
1990* @returns u1_cbp_c
1991*  coded block pattern chroma
1992*
1993* @remarks
1994*  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
1995*  mentioned in h.264 specification
1996*
1997*******************************************************************************
1998*/
1999UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc)
2000{
2001    /* Codec Context */
2002    codec_t *ps_codec = ps_proc->ps_codec;
2003
2004    /* pointer to ref macro block */
2005    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
2006
2007    /* pointer to src macro block */
2008    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
2009
2010    /* pointer to prediction macro block */
2011    UWORD8 *pu1_pred_mb = NULL;
2012
2013    /* pointer to residual macro block */
2014    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2015
2016    /* strides */
2017    WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
2018    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
2019    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
2020    WORD32 i4_res_strd = ps_proc->i4_res_strd;
2021
2022    /* intra mode */
2023    UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode;
2024
2025    /* coded block pattern */
2026    UWORD8 u1_cbp_c = 0;
2027
2028    /* number of non zero coeffs*/
2029    UWORD8 au1_nnz[18] = {0};
2030
2031    /* quantization parameters */
2032    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
2033
2034    /* Control signal for inverse transform */
2035    UWORD32 u4_cntrl;
2036
2037    /* pointer to packed mb coeff data */
2038    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2039
2040    /* See if we need to swap U and V plances for entropy */
2041    UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
2042
2043    if (PLANE_CH_I8x8 == u1_intra_mode)
2044    {
2045        pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane;
2046    }
2047    else
2048    {
2049        pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
2050    }
2051
2052    /********************************************************/
2053    /*  error estimation,                                   */
2054    /*  transform                                           */
2055    /*  quantization                                        */
2056    /********************************************************/
2057    ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
2058                                               pu1_pred_mb, pi2_res_mb,
2059                                               i4_src_strd, i4_pred_strd,
2060                                               i4_res_strd,
2061                                               ps_qp_params->pu2_scale_mat,
2062                                               ps_qp_params->pu2_thres_mat,
2063                                               ps_qp_params->u1_qbits,
2064                                               ps_qp_params->u4_dead_zone,
2065                                               au1_nnz);
2066
2067    /********************************************************/
2068    /*  pack coeff data for entropy coding                  */
2069    /********************************************************/
2070    ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
2071                     au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
2072
2073    /********************************************************/
2074    /*  ierror estimation,                                  */
2075    /*  itransform                                          */
2076    /*  iquantization                                       */
2077    /********************************************************/
2078    ih264e_chroma_8x8_idctrans_iquant_itrans_recon(ps_codec, pi2_res_mb,
2079                                                   pu1_pred_mb, pu1_ref_mb,
2080                                                   i4_res_strd, i4_pred_strd,
2081                                                   i4_rec_strd,
2082                                                   ps_qp_params->pu2_iscale_mat,
2083                                                   ps_qp_params->pu2_weigh_mat,
2084                                                   ps_qp_params->u1_qp_div,
2085                                                   u4_cntrl,
2086                                                   ps_proc->pv_scratch_buff);
2087    return (u1_cbp_c);
2088}
2089
2090
2091/**
2092*******************************************************************************
2093*
2094* @brief performs luma core coding when  mode is inter
2095*
2096* @par Description:
2097*  If the current mb is to be coded as inter the mb is predicted based on the
2098*  sub mb partitions and corresponding motion vectors generated by ME. Then,
2099*  error is computed between the input blk and the estimated blk. This error is
2100*  transformed, quantized. The quantized coefficients are packed in scan order
2101*  for entropy coding
2102*
2103* @param[in] ps_proc_ctxt
2104*  pointer to the current macro block context
2105*
2106* @returns u1_cbp_l
2107*  coded block pattern luma
2108*
2109* @remarks none
2110*
2111*******************************************************************************
2112*/
2113
2114UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc)
2115{
2116    /* Codec Context */
2117    codec_t *ps_codec = ps_proc->ps_codec;
2118
2119    /* pointer to ref macro block */
2120    UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
2121
2122    /* pointer to src macro block */
2123    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
2124
2125    /* pointer to prediction macro block */
2126    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
2127
2128    /* pointer to residual macro block */
2129    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2130
2131    /* strides */
2132    WORD32 i4_src_strd = ps_proc->i4_src_strd;
2133    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
2134    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
2135    WORD32 i4_res_strd = ps_proc->i4_res_strd;
2136
2137    /* coded block pattern */
2138    UWORD8 u1_cbp_l = 0;
2139
2140    /*Control signal of itrans*/
2141    UWORD32 u4_cntrl;
2142
2143    /* number of non zero coeffs*/
2144    UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz;
2145
2146    /* quantization parameters */
2147    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
2148
2149    /* pointer to packed mb coeff data */
2150    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2151
2152    /* pseudo pred buffer */
2153    UWORD8 *pu1_pseudo_pred = pu1_pred_mb;
2154
2155    /* pseudo pred buffer stride */
2156    WORD32 i4_pseudo_pred_strd = i4_pred_strd;
2157
2158    /* init nnz */
2159    ps_proc->au4_nnz[0] = 0;
2160    ps_proc->au4_nnz[1] = 0;
2161    ps_proc->au4_nnz[2] = 0;
2162    ps_proc->au4_nnz[3] = 0;
2163    ps_proc->au4_nnz[4] = 0;
2164
2165    /********************************************************/
2166    /*  prediction                                          */
2167    /********************************************************/
2168    ih264e_motion_comp_luma(ps_proc, &pu1_pseudo_pred, &i4_pseudo_pred_strd);
2169
2170    /********************************************************/
2171    /*  error estimation,                                   */
2172    /*  transform                                           */
2173    /*  quantization                                        */
2174    /********************************************************/
2175    if (ps_proc->u4_min_sad_reached == 0 || ps_proc->u4_min_sad != 0)
2176    {
2177        ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
2178                                                   pu1_pseudo_pred, pi2_res_mb,
2179                                                   i4_src_strd,
2180                                                   i4_pseudo_pred_strd,
2181                                                   i4_res_strd,
2182                                                   ps_qp_params->pu2_scale_mat,
2183                                                   ps_qp_params->pu2_thres_mat,
2184                                                   ps_qp_params->u1_qbits,
2185                                                   ps_qp_params->u4_dead_zone,
2186                                                   pu1_nnz,
2187                                                   DISABLE_DC_TRANSFORM);
2188
2189        /********************************************************/
2190        /*  pack coeff data for entropy coding                  */
2191        /********************************************************/
2192        ih264e_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
2193                         pu1_nnz, ps_codec->u4_thres_resi, &u4_cntrl);
2194    }
2195    else
2196    {
2197        u1_cbp_l = 0;
2198        u4_cntrl = 0;
2199    }
2200
2201    /********************************************************/
2202    /*  ierror estimation,                                  */
2203    /*  itransform                                          */
2204    /*  iquantization                                       */
2205    /********************************************************/
2206
2207    /*If the frame is not to be used for P frame reference or dumping recon
2208     * we only will use the reocn for only predicting intra Mbs
2209     * THis will need only right and bottom edge 4x4 blocks recon
2210     * Hence we selectively enable them using control signal(including DC)
2211     */
2212    if (ps_proc->u4_compute_recon != 1)
2213    {
2214        u4_cntrl &= 0x111F0000;
2215    }
2216
2217    if (u4_cntrl)
2218    {
2219        ih264e_luma_16x16_idctrans_iquant_itrans_recon(
2220                        ps_codec, pi2_res_mb, pu1_pseudo_pred, pu1_rec_mb,
2221                        i4_res_strd, i4_pseudo_pred_strd, i4_rec_strd,
2222                        ps_qp_params->pu2_iscale_mat,
2223                        ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
2224                        u4_cntrl /*Cntrl*/, DISABLE_DC_TRANSFORM,
2225                        ps_proc->pv_scratch_buff);
2226    }
2227    else
2228    {
2229        ps_codec->pf_inter_pred_luma_copy(pu1_pseudo_pred, pu1_rec_mb,
2230                                          i4_pseudo_pred_strd, i4_rec_strd,
2231                                          MB_SIZE, MB_SIZE, NULL, 0);
2232    }
2233
2234
2235    return (u1_cbp_l);
2236}
2237
2238/**
2239*******************************************************************************
2240*
2241* @brief performs chroma core coding for inter macro blocks
2242*
2243* @par Description:
2244*  If the current mb is to be coded as inter predicted mb,based on the sub mb partitions
2245*  and corresponding motion vectors generated by ME  ,prediction is done.
2246*  Then, error is computed between the input blk and the estimated blk.
2247*  This error is transformed , quantized. The quantized coefficients
2248*  are packed in scan order for
2249*  entropy coding.
2250*
2251* @param[in] ps_proc_ctxt
2252*  pointer to the current macro block context
2253*
2254* @returns u1_cbp_l
2255*  coded block pattern chroma
2256*
2257* @remarks none
2258*
2259*******************************************************************************
2260*/
2261UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc)
2262{
2263    /* Codec Context */
2264    codec_t *ps_codec = ps_proc->ps_codec;
2265
2266    /* pointer to ref macro block */
2267    UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_chroma;
2268
2269    /* pointer to src macro block */
2270    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
2271
2272    /* pointer to prediction macro block */
2273    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
2274
2275    /* pointer to residual macro block */
2276    WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2277
2278    /* strides */
2279    WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
2280    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
2281    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
2282    WORD32 i4_res_strd = ps_proc->i4_res_strd;
2283
2284    /* coded block pattern */
2285    UWORD8 u1_cbp_c = 0;
2286
2287    /*Control signal for inverse transform*/
2288    UWORD32 u4_cntrl;
2289
2290    /* number of non zero coeffs*/
2291    UWORD8 au1_nnz[10] = {0};
2292
2293    /* quantization parameters */
2294    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
2295
2296    /* pointer to packed mb coeff data */
2297    void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2298
2299    /*See if we need to swap U and V plances for entropy*/
2300    UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
2301
2302    /********************************************************/
2303    /*  prediction                                          */
2304    /********************************************************/
2305    ih264e_motion_comp_chroma(ps_proc);
2306
2307    /********************************************************/
2308    /*  error estimation,                                   */
2309    /*  transform                                           */
2310    /*  quantization                                        */
2311    /********************************************************/
2312    ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
2313                                               pu1_pred_mb, pi2_res_mb,
2314                                               i4_src_strd, i4_pred_strd,
2315                                               i4_res_strd,
2316                                               ps_qp_params->pu2_scale_mat,
2317                                               ps_qp_params->pu2_thres_mat,
2318                                               ps_qp_params->u1_qbits,
2319                                               ps_qp_params->u4_dead_zone,
2320                                               au1_nnz);
2321
2322    /********************************************************/
2323    /*  pack coeff data for entropy coding                  */
2324    /********************************************************/
2325    ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
2326                     au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
2327
2328    /********************************************************/
2329    /*  ierror estimation,                                  */
2330    /*  itransform                                          */
2331    /*  iquantization                                       */
2332    /********************************************************/
2333
2334    /* If the frame is not to be used for P frame reference or dumping recon
2335     * we only will use the reocn for only predicting intra Mbs
2336     * THis will need only right and bottom edge 4x4 blocks recon
2337     * Hence we selectively enable them using control signal(including DC)
2338     */
2339    if (!ps_proc->u4_compute_recon)
2340    {
2341        u4_cntrl &= 0x7700C000;
2342    }
2343
2344    if (u4_cntrl)
2345    {
2346        ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
2347                        ps_codec, pi2_res_mb, pu1_pred_mb, pu1_rec_mb,
2348                        i4_res_strd, i4_pred_strd, i4_rec_strd,
2349                        ps_qp_params->pu2_iscale_mat,
2350                        ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
2351                        u4_cntrl, ps_proc->pv_scratch_buff);
2352    }
2353    else
2354    {
2355        ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_rec_mb, i4_pred_strd,
2356                                          i4_rec_strd, MB_SIZE >> 1, MB_SIZE,
2357                                          NULL, 0);
2358    }
2359
2360    return (u1_cbp_c);
2361}
2362