1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_itrans_recon_16x16.c
22 *
23 * @brief
24 *  Contains function definitions for inverse transform  and reconstruction 16x16
25 *
26 *
27 * @author
28 *  100470
29 *
30 * @par List of Functions:
31 *  - ihevc_itrans_recon_16x16()
32 *
33 * @remarks
34 *  None
35 *
36 *******************************************************************************
37 */
38#include <stdio.h>
39#include <string.h>
40#include "ihevc_typedefs.h"
41#include "ihevc_macros.h"
42#include "ihevc_platform_macros.h"
43#include "ihevc_defs.h"
44#include "ihevc_trans_tables.h"
45#include "ihevc_itrans_recon.h"
46#include "ihevc_func_selector.h"
47#include "ihevc_trans_macros.h"
48
49/**
50 *******************************************************************************
51 *
52 * @brief
53 *  This function performs Inverse transform  and reconstruction for 16x16
54 * input block
55 *
56 * @par Description:
57 *  Performs inverse transform and adds the prediction  data and clips output
58 * to 8 bit
59 *
60 * @param[in] pi2_src
61 *  Input 16x16 coefficients
62 *
63 * @param[in] pi2_tmp
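64 *  Temporary 16x16 buffer for storing the inverse transform
65 *  1st stage output; stage 1 writes one row of 16 clipped
66 *  values per input column and stage 2 reads the buffer
67 *  back column by column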
68 *
69 * @param[in] pu1_pred
70 *  Prediction 16x16 block
71 *
72 * @param[out] pu1_dst
73 *  Output 16x16 block
74 *
75 * @param[in] src_strd
76 *  Input stride
77 *
78 * @param[in] pred_strd
79 *  Prediction stride
80 *
81 * @param[in] dst_strd
82 *  Output Stride
83 *
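84 * @param[in] zero_cols
85 *  Bit mask of zero columns in pi2_src; a set bit j marks column j as all-zero
86 *
87 * @param[in] zero_rows
88 *  Bit mask of zero rows in pi2_src; bits 4-15 (or 8-15) all set selects the 4-row (or 8-row) path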
89 *
90 * @returns  Void
91 *
92 * @remarks
93 *  None
94 *
95 *******************************************************************************
96 */
97
98void ihevc_itrans_recon_16x16(WORD16 *pi2_src,
99                              WORD16 *pi2_tmp,
100                              UWORD8 *pu1_pred,
101                              UWORD8 *pu1_dst,
102                              WORD32 src_strd,
103                              WORD32 pred_strd,
104                              WORD32 dst_strd,
105                              WORD32 zero_cols,
106                              WORD32 zero_rows)
107{
108    WORD32 j, k;
109    WORD32 e[8], o[8];
110    WORD32 ee[4], eo[4];
111    WORD32 eee[2], eeo[2];
112    WORD32 add;
113    WORD32 shift;
114    WORD16 *pi2_tmp_orig;
115    WORD32 trans_size;
116    WORD32 zero_rows_2nd_stage = zero_cols;
117    WORD32 row_limit_2nd_stage;
118
119    if((zero_cols & 0xFFF0) == 0xFFF0)
120        row_limit_2nd_stage = 4;
121    else if((zero_cols & 0xFF00) == 0xFF00)
122        row_limit_2nd_stage = 8;
123    else
124        row_limit_2nd_stage = TRANS_SIZE_16;
125
126    trans_size = TRANS_SIZE_16;
127    pi2_tmp_orig = pi2_tmp;
128    if((zero_rows & 0xFFF0) == 0xFFF0)  /* First 4 rows of input are non-zero */
129    {
130        /* Inverse Transform 1st stage */
131        /************************************************************************************************/
132        /**********************************START - IT_RECON_16x16****************************************/
133        /************************************************************************************************/
134
135        shift = IT_SHIFT_STAGE_1;
136        add = 1 << (shift - 1);
137
138        for(j = 0; j < row_limit_2nd_stage; j++)
139        {
140            /* Checking for Zero Cols */
141            if((zero_cols & 1) == 1)
142            {
143                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
144            }
145            else
146            {
147                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
148                for(k = 0; k < 8; k++)
149                {
150                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
151                                    + g_ai2_ihevc_trans_16[3][k]
152                                                    * pi2_src[3 * src_strd];
153                }
154                for(k = 0; k < 4; k++)
155                {
156                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
157                }
158                eeo[0] = 0;
159                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
160                eeo[1] = 0;
161                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
162
163                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
164                for(k = 0; k < 2; k++)
165                {
166                    ee[k] = eee[k] + eeo[k];
167                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
168                }
169                for(k = 0; k < 4; k++)
170                {
171                    e[k] = ee[k] + eo[k];
172                    e[k + 4] = ee[3 - k] - eo[3 - k];
173                }
174                for(k = 0; k < 8; k++)
175                {
176                    pi2_tmp[k] =
177                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
178                    pi2_tmp[k + 8] =
179                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
180                }
181            }
182            pi2_src++;
183            pi2_tmp += trans_size;
184            zero_cols = zero_cols >> 1;
185        }
186
187        pi2_tmp = pi2_tmp_orig;
188
189        /* Inverse Transform 2nd stage */
190        shift = IT_SHIFT_STAGE_2;
191        add = 1 << (shift - 1);
192
193        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
194        {
195            for(j = 0; j < trans_size; j++)
196            {
197                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
198                for(k = 0; k < 8; k++)
199                {
200                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
201                                    + g_ai2_ihevc_trans_16[3][k]
202                                                    * pi2_tmp[3 * trans_size];
203                }
204                for(k = 0; k < 4; k++)
205                {
206                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
207                }
208                eeo[0] = 0;
209                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
210                eeo[1] = 0;
211                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
212
213                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
214                for(k = 0; k < 2; k++)
215                {
216                    ee[k] = eee[k] + eeo[k];
217                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
218                }
219                for(k = 0; k < 4; k++)
220                {
221                    e[k] = ee[k] + eo[k];
222                    e[k + 4] = ee[3 - k] - eo[3 - k];
223                }
224                for(k = 0; k < 8; k++)
225                {
226                    WORD32 itrans_out;
227                    itrans_out =
228                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
229                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
230                    itrans_out =
231                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
232                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
233                }
234                pi2_tmp++;
235                pu1_pred += pred_strd;
236                pu1_dst += dst_strd;
237            }
238        }
239        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
240        {
241            for(j = 0; j < trans_size; j++)
242            {
243                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
244                for(k = 0; k < 8; k++)
245                {
246                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
247                                    + g_ai2_ihevc_trans_16[3][k]
248                                                    * pi2_tmp[3 * trans_size]
249                                    + g_ai2_ihevc_trans_16[5][k]
250                                                    * pi2_tmp[5 * trans_size]
251                                    + g_ai2_ihevc_trans_16[7][k]
252                                                    * pi2_tmp[7 * trans_size];
253                }
254                for(k = 0; k < 4; k++)
255                {
256                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
257                                    + g_ai2_ihevc_trans_16[6][k]
258                                                    * pi2_tmp[6 * trans_size];
259                }
260                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
261                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
262                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
263                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
264
265                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
266                for(k = 0; k < 2; k++)
267                {
268                    ee[k] = eee[k] + eeo[k];
269                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
270                }
271                for(k = 0; k < 4; k++)
272                {
273                    e[k] = ee[k] + eo[k];
274                    e[k + 4] = ee[3 - k] - eo[3 - k];
275                }
276                for(k = 0; k < 8; k++)
277                {
278                    WORD32 itrans_out;
279                    itrans_out =
280                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
281                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
282                    itrans_out =
283                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
284                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
285                }
286                pi2_tmp++;
287                pu1_pred += pred_strd;
288                pu1_dst += dst_strd;
289            }
290        }
291        else /* All rows of output of 1st stage are non-zero */
292        {
293            for(j = 0; j < trans_size; j++)
294            {
295                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
296                for(k = 0; k < 8; k++)
297                {
298                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
299                                    + g_ai2_ihevc_trans_16[3][k]
300                                                    * pi2_tmp[3 * trans_size]
301                                    + g_ai2_ihevc_trans_16[5][k]
302                                                    * pi2_tmp[5 * trans_size]
303                                    + g_ai2_ihevc_trans_16[7][k]
304                                                    * pi2_tmp[7 * trans_size]
305                                    + g_ai2_ihevc_trans_16[9][k]
306                                                    * pi2_tmp[9 * trans_size]
307                                    + g_ai2_ihevc_trans_16[11][k]
308                                                    * pi2_tmp[11 * trans_size]
309                                    + g_ai2_ihevc_trans_16[13][k]
310                                                    * pi2_tmp[13 * trans_size]
311                                    + g_ai2_ihevc_trans_16[15][k]
312                                                    * pi2_tmp[15 * trans_size];
313                }
314                for(k = 0; k < 4; k++)
315                {
316                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
317                                    + g_ai2_ihevc_trans_16[6][k]
318                                                    * pi2_tmp[6 * trans_size]
319                                    + g_ai2_ihevc_trans_16[10][k]
320                                                    * pi2_tmp[10 * trans_size]
321                                    + g_ai2_ihevc_trans_16[14][k]
322                                                    * pi2_tmp[14 * trans_size];
323                }
324                eeo[0] =
325                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
326                                                + g_ai2_ihevc_trans_16[12][0]
327                                                                * pi2_tmp[12
328                                                                                * trans_size];
329                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
330                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
331                eeo[1] =
332                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
333                                                + g_ai2_ihevc_trans_16[12][1]
334                                                                * pi2_tmp[12
335                                                                                * trans_size];
336                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
337                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
338
339                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
340                for(k = 0; k < 2; k++)
341                {
342                    ee[k] = eee[k] + eeo[k];
343                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
344                }
345                for(k = 0; k < 4; k++)
346                {
347                    e[k] = ee[k] + eo[k];
348                    e[k + 4] = ee[3 - k] - eo[3 - k];
349                }
350                for(k = 0; k < 8; k++)
351                {
352                    WORD32 itrans_out;
353                    itrans_out =
354                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
355                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
356                    itrans_out =
357                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
358                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
359                }
360                pi2_tmp++;
361                pu1_pred += pred_strd;
362                pu1_dst += dst_strd;
363            }
364        }
365        /************************************************************************************************/
366        /************************************END - IT_RECON_16x16****************************************/
367        /************************************************************************************************/
368    }
369    else if((zero_rows & 0xFF00) == 0xFF00)  /* First 8 rows of input are non-zero */
370    {
371        /* Inverse Transform 1st stage */
372        /************************************************************************************************/
373        /**********************************START - IT_RECON_16x16****************************************/
374        /************************************************************************************************/
375
376        shift = IT_SHIFT_STAGE_1;
377        add = 1 << (shift - 1);
378
379        for(j = 0; j < row_limit_2nd_stage; j++)
380        {
381            /* Checking for Zero Cols */
382            if((zero_cols & 1) == 1)
383            {
384                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
385            }
386            else
387            {
388                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
389                for(k = 0; k < 8; k++)
390                {
391                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
392                                    + g_ai2_ihevc_trans_16[3][k]
393                                                    * pi2_src[3 * src_strd]
394                                    + g_ai2_ihevc_trans_16[5][k]
395                                                    * pi2_src[5 * src_strd]
396                                    + g_ai2_ihevc_trans_16[7][k]
397                                                    * pi2_src[7 * src_strd];
398                }
399                for(k = 0; k < 4; k++)
400                {
401                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
402                                    + g_ai2_ihevc_trans_16[6][k]
403                                                    * pi2_src[6 * src_strd];
404                }
405                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
406                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
407                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
408                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
409
410                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411                for(k = 0; k < 2; k++)
412                {
413                    ee[k] = eee[k] + eeo[k];
414                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
415                }
416                for(k = 0; k < 4; k++)
417                {
418                    e[k] = ee[k] + eo[k];
419                    e[k + 4] = ee[3 - k] - eo[3 - k];
420                }
421                for(k = 0; k < 8; k++)
422                {
423                    pi2_tmp[k] =
424                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
425                    pi2_tmp[k + 8] =
426                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
427                }
428            }
429            pi2_src++;
430            pi2_tmp += trans_size;
431            zero_cols = zero_cols >> 1;
432        }
433
434        pi2_tmp = pi2_tmp_orig;
435
436        /* Inverse Transform 2nd stage */
437        shift = IT_SHIFT_STAGE_2;
438        add = 1 << (shift - 1);
439
440        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
441        {
442            for(j = 0; j < trans_size; j++)
443            {
444                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
445                for(k = 0; k < 8; k++)
446                {
447                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
448                                    + g_ai2_ihevc_trans_16[3][k]
449                                                    * pi2_tmp[3 * trans_size];
450                }
451                for(k = 0; k < 4; k++)
452                {
453                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
454                }
455                eeo[0] = 0;
456                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
457                eeo[1] = 0;
458                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
459
460                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
461                for(k = 0; k < 2; k++)
462                {
463                    ee[k] = eee[k] + eeo[k];
464                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
465                }
466                for(k = 0; k < 4; k++)
467                {
468                    e[k] = ee[k] + eo[k];
469                    e[k + 4] = ee[3 - k] - eo[3 - k];
470                }
471                for(k = 0; k < 8; k++)
472                {
473                    WORD32 itrans_out;
474                    itrans_out =
475                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
476                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
477                    itrans_out =
478                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
479                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
480                }
481                pi2_tmp++;
482                pu1_pred += pred_strd;
483                pu1_dst += dst_strd;
484            }
485        }
486        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
487        {
488            for(j = 0; j < trans_size; j++)
489            {
490                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
491                for(k = 0; k < 8; k++)
492                {
493                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
494                                    + g_ai2_ihevc_trans_16[3][k]
495                                                    * pi2_tmp[3 * trans_size]
496                                    + g_ai2_ihevc_trans_16[5][k]
497                                                    * pi2_tmp[5 * trans_size]
498                                    + g_ai2_ihevc_trans_16[7][k]
499                                                    * pi2_tmp[7 * trans_size];
500                }
501                for(k = 0; k < 4; k++)
502                {
503                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
504                                    + g_ai2_ihevc_trans_16[6][k]
505                                                    * pi2_tmp[6 * trans_size];
506                }
507                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
508                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
509                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
510                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
511
512                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
513                for(k = 0; k < 2; k++)
514                {
515                    ee[k] = eee[k] + eeo[k];
516                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
517                }
518                for(k = 0; k < 4; k++)
519                {
520                    e[k] = ee[k] + eo[k];
521                    e[k + 4] = ee[3 - k] - eo[3 - k];
522                }
523                for(k = 0; k < 8; k++)
524                {
525                    WORD32 itrans_out;
526                    itrans_out =
527                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
528                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
529                    itrans_out =
530                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
531                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
532                }
533                pi2_tmp++;
534                pu1_pred += pred_strd;
535                pu1_dst += dst_strd;
536            }
537        }
538        else /* All rows of output of 1st stage are non-zero */
539        {
540            for(j = 0; j < trans_size; j++)
541            {
542                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
543                for(k = 0; k < 8; k++)
544                {
545                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
546                                    + g_ai2_ihevc_trans_16[3][k]
547                                                    * pi2_tmp[3 * trans_size]
548                                    + g_ai2_ihevc_trans_16[5][k]
549                                                    * pi2_tmp[5 * trans_size]
550                                    + g_ai2_ihevc_trans_16[7][k]
551                                                    * pi2_tmp[7 * trans_size]
552                                    + g_ai2_ihevc_trans_16[9][k]
553                                                    * pi2_tmp[9 * trans_size]
554                                    + g_ai2_ihevc_trans_16[11][k]
555                                                    * pi2_tmp[11 * trans_size]
556                                    + g_ai2_ihevc_trans_16[13][k]
557                                                    * pi2_tmp[13 * trans_size]
558                                    + g_ai2_ihevc_trans_16[15][k]
559                                                    * pi2_tmp[15 * trans_size];
560                }
561                for(k = 0; k < 4; k++)
562                {
563                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
564                                    + g_ai2_ihevc_trans_16[6][k]
565                                                    * pi2_tmp[6 * trans_size]
566                                    + g_ai2_ihevc_trans_16[10][k]
567                                                    * pi2_tmp[10 * trans_size]
568                                    + g_ai2_ihevc_trans_16[14][k]
569                                                    * pi2_tmp[14 * trans_size];
570                }
571                eeo[0] =
572                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
573                                                + g_ai2_ihevc_trans_16[12][0]
574                                                                * pi2_tmp[12
575                                                                                * trans_size];
576                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
577                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
578                eeo[1] =
579                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
580                                                + g_ai2_ihevc_trans_16[12][1]
581                                                                * pi2_tmp[12
582                                                                                * trans_size];
583                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
584                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
585
586                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
587                for(k = 0; k < 2; k++)
588                {
589                    ee[k] = eee[k] + eeo[k];
590                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
591                }
592                for(k = 0; k < 4; k++)
593                {
594                    e[k] = ee[k] + eo[k];
595                    e[k + 4] = ee[3 - k] - eo[3 - k];
596                }
597                for(k = 0; k < 8; k++)
598                {
599                    WORD32 itrans_out;
600                    itrans_out =
601                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
602                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
603                    itrans_out =
604                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
605                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
606                }
607                pi2_tmp++;
608                pu1_pred += pred_strd;
609                pu1_dst += dst_strd;
610            }
611        }
612        /************************************************************************************************/
613        /************************************END - IT_RECON_16x16****************************************/
614        /************************************************************************************************/
615    }
616    else  /* All rows of input are non-zero */
617    {
618        /* Inverse Transform 1st stage */
619        /************************************************************************************************/
620        /**********************************START - IT_RECON_16x16****************************************/
621        /************************************************************************************************/
622
623        shift = IT_SHIFT_STAGE_1;
624        add = 1 << (shift - 1);
625
626        for(j = 0; j < row_limit_2nd_stage; j++)
627        {
628            /* Checking for Zero Cols */
629            if((zero_cols & 1) == 1)
630            {
631                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
632            }
633            else
634            {
635                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
636                for(k = 0; k < 8; k++)
637                {
638                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
639                                    + g_ai2_ihevc_trans_16[3][k]
640                                                    * pi2_src[3 * src_strd]
641                                    + g_ai2_ihevc_trans_16[5][k]
642                                                    * pi2_src[5 * src_strd]
643                                    + g_ai2_ihevc_trans_16[7][k]
644                                                    * pi2_src[7 * src_strd]
645                                    + g_ai2_ihevc_trans_16[9][k]
646                                                    * pi2_src[9 * src_strd]
647                                    + g_ai2_ihevc_trans_16[11][k]
648                                                    * pi2_src[11 * src_strd]
649                                    + g_ai2_ihevc_trans_16[13][k]
650                                                    * pi2_src[13 * src_strd]
651                                    + g_ai2_ihevc_trans_16[15][k]
652                                                    * pi2_src[15 * src_strd];
653                }
654                for(k = 0; k < 4; k++)
655                {
656                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
657                                    + g_ai2_ihevc_trans_16[6][k]
658                                                    * pi2_src[6 * src_strd]
659                                    + g_ai2_ihevc_trans_16[10][k]
660                                                    * pi2_src[10 * src_strd]
661                                    + g_ai2_ihevc_trans_16[14][k]
662                                                    * pi2_src[14 * src_strd];
663                }
664                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
665                                + g_ai2_ihevc_trans_16[12][0]
666                                                * pi2_src[12 * src_strd];
667                eee[0] =
668                                g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
669                                                + g_ai2_ihevc_trans_16[8][0]
670                                                                * pi2_src[8
671                                                                                * src_strd];
672                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
673                                + g_ai2_ihevc_trans_16[12][1]
674                                                * pi2_src[12 * src_strd];
675                eee[1] =
676                                g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
677                                                + g_ai2_ihevc_trans_16[8][1]
678                                                                * pi2_src[8
679                                                                                * src_strd];
680
681                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
682                for(k = 0; k < 2; k++)
683                {
684                    ee[k] = eee[k] + eeo[k];
685                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
686                }
687                for(k = 0; k < 4; k++)
688                {
689                    e[k] = ee[k] + eo[k];
690                    e[k + 4] = ee[3 - k] - eo[3 - k];
691                }
692                for(k = 0; k < 8; k++)
693                {
694                    pi2_tmp[k] =
695                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
696                    pi2_tmp[k + 8] =
697                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
698                }
699            }
700            pi2_src++;
701            pi2_tmp += trans_size;
702            zero_cols = zero_cols >> 1;
703        }
704
705        pi2_tmp = pi2_tmp_orig;
706
707        /* Inverse Transform 2nd stage */
708        shift = IT_SHIFT_STAGE_2;
709        add = 1 << (shift - 1);
710
711        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
712        {
713            for(j = 0; j < trans_size; j++)
714            {
715                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
716                for(k = 0; k < 8; k++)
717                {
718                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
719                                    + g_ai2_ihevc_trans_16[3][k]
720                                                    * pi2_tmp[3 * trans_size];
721                }
722                for(k = 0; k < 4; k++)
723                {
724                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
725                }
726                eeo[0] = 0;
727                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
728                eeo[1] = 0;
729                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
730
731                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
732                for(k = 0; k < 2; k++)
733                {
734                    ee[k] = eee[k] + eeo[k];
735                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
736                }
737                for(k = 0; k < 4; k++)
738                {
739                    e[k] = ee[k] + eo[k];
740                    e[k + 4] = ee[3 - k] - eo[3 - k];
741                }
742                for(k = 0; k < 8; k++)
743                {
744                    WORD32 itrans_out;
745                    itrans_out =
746                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
747                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
748                    itrans_out =
749                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
750                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
751                }
752                pi2_tmp++;
753                pu1_pred += pred_strd;
754                pu1_dst += dst_strd;
755            }
756        }
        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
758        {
759            for(j = 0; j < trans_size; j++)
760            {
761                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
762                for(k = 0; k < 8; k++)
763                {
764                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
765                                    + g_ai2_ihevc_trans_16[3][k]
766                                                    * pi2_tmp[3 * trans_size]
767                                    + g_ai2_ihevc_trans_16[5][k]
768                                                    * pi2_tmp[5 * trans_size]
769                                    + g_ai2_ihevc_trans_16[7][k]
770                                                    * pi2_tmp[7 * trans_size];
771                }
772                for(k = 0; k < 4; k++)
773                {
774                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
775                                    + g_ai2_ihevc_trans_16[6][k]
776                                                    * pi2_tmp[6 * trans_size];
777                }
778                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
779                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
780                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
781                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
782
783                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
784                for(k = 0; k < 2; k++)
785                {
786                    ee[k] = eee[k] + eeo[k];
787                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
788                }
789                for(k = 0; k < 4; k++)
790                {
791                    e[k] = ee[k] + eo[k];
792                    e[k + 4] = ee[3 - k] - eo[3 - k];
793                }
794                for(k = 0; k < 8; k++)
795                {
796                    WORD32 itrans_out;
797                    itrans_out =
798                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
799                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
800                    itrans_out =
801                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
802                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
803                }
804                pi2_tmp++;
805                pu1_pred += pred_strd;
806                pu1_dst += dst_strd;
807            }
808        }
809        else /* All rows of output of 1st stage are non-zero */
810        {
811            for(j = 0; j < trans_size; j++)
812            {
813                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
814                for(k = 0; k < 8; k++)
815                {
816                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
817                                    + g_ai2_ihevc_trans_16[3][k]
818                                                    * pi2_tmp[3 * trans_size]
819                                    + g_ai2_ihevc_trans_16[5][k]
820                                                    * pi2_tmp[5 * trans_size]
821                                    + g_ai2_ihevc_trans_16[7][k]
822                                                    * pi2_tmp[7 * trans_size]
823                                    + g_ai2_ihevc_trans_16[9][k]
824                                                    * pi2_tmp[9 * trans_size]
825                                    + g_ai2_ihevc_trans_16[11][k]
826                                                    * pi2_tmp[11 * trans_size]
827                                    + g_ai2_ihevc_trans_16[13][k]
828                                                    * pi2_tmp[13 * trans_size]
829                                    + g_ai2_ihevc_trans_16[15][k]
830                                                    * pi2_tmp[15 * trans_size];
831                }
832                for(k = 0; k < 4; k++)
833                {
834                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
835                                    + g_ai2_ihevc_trans_16[6][k]
836                                                    * pi2_tmp[6 * trans_size]
837                                    + g_ai2_ihevc_trans_16[10][k]
838                                                    * pi2_tmp[10 * trans_size]
839                                    + g_ai2_ihevc_trans_16[14][k]
840                                                    * pi2_tmp[14 * trans_size];
841                }
842                eeo[0] =
843                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
844                                                + g_ai2_ihevc_trans_16[12][0]
845                                                                * pi2_tmp[12
846                                                                                * trans_size];
847                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
848                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
849                eeo[1] =
850                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
851                                                + g_ai2_ihevc_trans_16[12][1]
852                                                                * pi2_tmp[12
853                                                                                * trans_size];
854                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
855                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
856
857                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
858                for(k = 0; k < 2; k++)
859                {
860                    ee[k] = eee[k] + eeo[k];
861                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
862                }
863                for(k = 0; k < 4; k++)
864                {
865                    e[k] = ee[k] + eo[k];
866                    e[k + 4] = ee[3 - k] - eo[3 - k];
867                }
868                for(k = 0; k < 8; k++)
869                {
870                    WORD32 itrans_out;
871                    itrans_out =
872                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
873                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
874                    itrans_out =
875                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
876                    pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
877                }
878                pi2_tmp++;
879                pu1_pred += pred_strd;
880                pu1_dst += dst_strd;
881            }
882        }
883        /************************************************************************************************/
884        /************************************END - IT_RECON_16x16****************************************/
885        /************************************************************************************************/
886    }
887
888}
889
890