1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_chroma_itrans_recon_16x16.c
22 *
23 * @brief
24 *  Contains function definitions for 16x16 inverse transform  and reconstruction
25 * of chroma interleaved data.
26 *
27 * @author
28 *  100470
29 *
30 * @par List of Functions:
31 *  - ihevc_chroma_itrans_recon_16x16()
32 *
33 * @remarks
34 *  None
35 *
36 *******************************************************************************
37 */
38
39#include <stdio.h>
40#include <string.h>
41#include "ihevc_typedefs.h"
42#include "ihevc_macros.h"
43#include "ihevc_platform_macros.h"
44#include "ihevc_defs.h"
45#include "ihevc_trans_tables.h"
46#include "ihevc_chroma_itrans_recon.h"
47#include "ihevc_func_selector.h"
48#include "ihevc_trans_macros.h"
49
50/* All the functions work on one component (U or V) of interleaved data, depending upon the pointers passed to them */
51/* Data visualization */
52/* U V U V U V U V */
53/* U V U V U V U V */
54/* U V U V U V U V */
55/* U V U V U V U V */
56/* If the pointer points to the first byte of the above stream (U), the functions will operate on the U component */
57/* If the pointer points to the second byte of the above stream (V), the functions will operate on the V component */
58
59
60/**
61 *******************************************************************************
62 *
63 * @brief
64 *  This function performs Inverse transform  and reconstruction for 16x16
65 * input block
66 *
67 * @par Description:
68 *  Performs inverse transform and adds the prediction  data and clips output
69 * to 8 bit
70 *
71 * @param[in] pi2_src
72 *  Input 16x16 coefficients
73 *
74 * @param[in] pi2_tmp
75 *  Temporary 16x16 buffer for storing inverse transform
76 *  1st stage output
77 *
78 * @param[in] pu1_pred
79 *  Prediction 16x16 block
80 *
81 * @param[out] pu1_dst
82 *  Output 16x16 block
83 *
84 * @param[in] src_strd
85 *  Input stride
86 *
87 * @param[in] pred_strd
88 *  Prediction stride
89 *
90 * @param[in] dst_strd
91 *  Output Stride
92 *
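93 * @param[in] zero_cols
94 *  Bitmask of zero columns in pi2_src (bit j set => column j is skipped as all-zero)
95 *
96 * @param[in] zero_rows
97 *  Bitmask of zero rows in pi2_src (bits 4-15 or 8-15 all set => only first 4 or 8 rows are read)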
98 *
99 * @returns  Void
100 *
101 * @remarks
102 *  None
103 *
104 *******************************************************************************
105 */
106
107
108void ihevc_chroma_itrans_recon_16x16(WORD16 *pi2_src,
109                                     WORD16 *pi2_tmp,
110                                     UWORD8 *pu1_pred,
111                                     UWORD8 *pu1_dst,
112                                     WORD32 src_strd,
113                                     WORD32 pred_strd,
114                                     WORD32 dst_strd,
115                                     WORD32 zero_cols,
116                                     WORD32 zero_rows)
117{
118    WORD32 j, k;
119    WORD32 e[8], o[8];
120    WORD32 ee[4], eo[4];
121    WORD32 eee[2], eeo[2];
122    WORD32 add;
123    WORD32 shift;
124    WORD16 *pi2_tmp_orig;
125    WORD32 trans_size;
126    WORD32 row_limit_2nd_stage, zero_rows_2nd_stage = zero_cols;
127
128    trans_size = TRANS_SIZE_16;
129    pi2_tmp_orig = pi2_tmp;
130
131    if((zero_cols & 0xFFF0) == 0xFFF0)
132        row_limit_2nd_stage = 4;
133    else if((zero_cols & 0xFF00) == 0xFF00)
134        row_limit_2nd_stage = 8;
135    else
136        row_limit_2nd_stage = TRANS_SIZE_16;
137
138    if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
139    {
140        /************************************************************************************************/
141        /**********************************START - IT_RECON_16x16****************************************/
142        /************************************************************************************************/
143
144        /* Inverse Transform 1st stage */
145        shift = IT_SHIFT_STAGE_1;
146        add = 1 << (shift - 1);
147
148        for(j = 0; j < row_limit_2nd_stage; j++)
149        {
150            /* Checking for Zero Cols */
151            if((zero_cols & 1) == 1)
152            {
153                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
154            }
155            else
156            {
157                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
158                for(k = 0; k < 8; k++)
159                {
160                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
161                                    + g_ai2_ihevc_trans_16[3][k]
162                                                    * pi2_src[3 * src_strd];
163                }
164                for(k = 0; k < 4; k++)
165                {
166                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
167                }
168                eeo[0] = 0;
169                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
170                eeo[1] = 0;
171                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
172
173                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
174                for(k = 0; k < 2; k++)
175                {
176                    ee[k] = eee[k] + eeo[k];
177                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
178                }
179                for(k = 0; k < 4; k++)
180                {
181                    e[k] = ee[k] + eo[k];
182                    e[k + 4] = ee[3 - k] - eo[3 - k];
183                }
184                for(k = 0; k < 8; k++)
185                {
186                    pi2_tmp[k] =
187                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
188                    pi2_tmp[k + 8] =
189                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
190                }
191            }
192            pi2_src++;
193            pi2_tmp += trans_size;
194            zero_cols = zero_cols >> 1;
195        }
196
197        pi2_tmp = pi2_tmp_orig;
198
199        /* Inverse Transform 2nd stage */
200        shift = IT_SHIFT_STAGE_2;
201        add = 1 << (shift - 1);
202        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
203        {
204            for(j = 0; j < trans_size; j++)
205            {
206                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
207                for(k = 0; k < 8; k++)
208                {
209                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
210                                    + g_ai2_ihevc_trans_16[3][k]
211                                                    * pi2_tmp[3 * trans_size];
212                }
213                for(k = 0; k < 4; k++)
214                {
215                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
216                }
217                eeo[0] = 0;
218                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
219                eeo[1] = 0;
220                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
221
222                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
223                for(k = 0; k < 2; k++)
224                {
225                    ee[k] = eee[k] + eeo[k];
226                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
227                }
228                for(k = 0; k < 4; k++)
229                {
230                    e[k] = ee[k] + eo[k];
231                    e[k + 4] = ee[3 - k] - eo[3 - k];
232                }
233                for(k = 0; k < 8; k++)
234                {
235                    WORD32 itrans_out;
236                    itrans_out =
237                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
238                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
239                    itrans_out =
240                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
241                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
242                }
243                pi2_tmp++;
244                pu1_pred += pred_strd;
245                pu1_dst += dst_strd;
246            }
247        }
248        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
249        {
250            for(j = 0; j < trans_size; j++)
251            {
252                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
253                for(k = 0; k < 8; k++)
254                {
255                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
256                                    + g_ai2_ihevc_trans_16[3][k]
257                                                    * pi2_tmp[3 * trans_size]
258                                    + g_ai2_ihevc_trans_16[5][k]
259                                                    * pi2_tmp[5 * trans_size]
260                                    + g_ai2_ihevc_trans_16[7][k]
261                                                    * pi2_tmp[7 * trans_size];
262                }
263                for(k = 0; k < 4; k++)
264                {
265                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
266                                    + g_ai2_ihevc_trans_16[6][k]
267                                                    * pi2_tmp[6 * trans_size];
268                }
269                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
270                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
271                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
272                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
273
274                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
275                for(k = 0; k < 2; k++)
276                {
277                    ee[k] = eee[k] + eeo[k];
278                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
279                }
280                for(k = 0; k < 4; k++)
281                {
282                    e[k] = ee[k] + eo[k];
283                    e[k + 4] = ee[3 - k] - eo[3 - k];
284                }
285                for(k = 0; k < 8; k++)
286                {
287                    WORD32 itrans_out;
288                    itrans_out =
289                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
290                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
291                    itrans_out =
292                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
293                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
294                }
295                pi2_tmp++;
296                pu1_pred += pred_strd;
297                pu1_dst += dst_strd;
298            }
299        }
300        else /* All rows of output of 1st stage are non-zero */
301        {
302            for(j = 0; j < trans_size; j++)
303            {
304                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
305                for(k = 0; k < 8; k++)
306                {
307                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
308                                    + g_ai2_ihevc_trans_16[3][k]
309                                                    * pi2_tmp[3 * trans_size]
310                                    + g_ai2_ihevc_trans_16[5][k]
311                                                    * pi2_tmp[5 * trans_size]
312                                    + g_ai2_ihevc_trans_16[7][k]
313                                                    * pi2_tmp[7 * trans_size]
314                                    + g_ai2_ihevc_trans_16[9][k]
315                                                    * pi2_tmp[9 * trans_size]
316                                    + g_ai2_ihevc_trans_16[11][k]
317                                                    * pi2_tmp[11 * trans_size]
318                                    + g_ai2_ihevc_trans_16[13][k]
319                                                    * pi2_tmp[13 * trans_size]
320                                    + g_ai2_ihevc_trans_16[15][k]
321                                                    * pi2_tmp[15 * trans_size];
322                }
323                for(k = 0; k < 4; k++)
324                {
325                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
326                                    + g_ai2_ihevc_trans_16[6][k]
327                                                    * pi2_tmp[6 * trans_size]
328                                    + g_ai2_ihevc_trans_16[10][k]
329                                                    * pi2_tmp[10 * trans_size]
330                                    + g_ai2_ihevc_trans_16[14][k]
331                                                    * pi2_tmp[14 * trans_size];
332                }
333                eeo[0] =
334                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
335                                                + g_ai2_ihevc_trans_16[12][0]
336                                                                * pi2_tmp[12
337                                                                                * trans_size];
338                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
339                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
340                eeo[1] =
341                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
342                                                + g_ai2_ihevc_trans_16[12][1]
343                                                                * pi2_tmp[12
344                                                                                * trans_size];
345                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
346                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
347
348                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
349                for(k = 0; k < 2; k++)
350                {
351                    ee[k] = eee[k] + eeo[k];
352                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
353                }
354                for(k = 0; k < 4; k++)
355                {
356                    e[k] = ee[k] + eo[k];
357                    e[k + 4] = ee[3 - k] - eo[3 - k];
358                }
359                for(k = 0; k < 8; k++)
360                {
361                    WORD32 itrans_out;
362                    itrans_out =
363                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
364                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
365                    itrans_out =
366                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
367                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
368                }
369                pi2_tmp++;
370                pu1_pred += pred_strd;
371                pu1_dst += dst_strd;
372            }
373        }
374        /************************************************************************************************/
375        /************************************END - IT_RECON_16x16****************************************/
376        /************************************************************************************************/
377    }
378    else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
379    {
380        /************************************************************************************************/
381        /**********************************START - IT_RECON_16x16****************************************/
382        /************************************************************************************************/
383
384        /* Inverse Transform 1st stage */
385        shift = IT_SHIFT_STAGE_1;
386        add = 1 << (shift - 1);
387
388        for(j = 0; j < row_limit_2nd_stage; j++)
389        {
390            /* Checking for Zero Cols */
391            if((zero_cols & 1) == 1)
392            {
393                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
394            }
395            else
396            {
397                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
398                for(k = 0; k < 8; k++)
399                {
400                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
401                                    + g_ai2_ihevc_trans_16[3][k]
402                                                    * pi2_src[3 * src_strd]
403                                    + g_ai2_ihevc_trans_16[5][k]
404                                                    * pi2_src[5 * src_strd]
405                                    + g_ai2_ihevc_trans_16[7][k]
406                                                    * pi2_src[7 * src_strd];
407                }
408                for(k = 0; k < 4; k++)
409                {
410                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
411                                    + g_ai2_ihevc_trans_16[6][k]
412                                                    * pi2_src[6 * src_strd];
413                }
414                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
415                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
416                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
417                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
418
419                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
420                for(k = 0; k < 2; k++)
421                {
422                    ee[k] = eee[k] + eeo[k];
423                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
424                }
425                for(k = 0; k < 4; k++)
426                {
427                    e[k] = ee[k] + eo[k];
428                    e[k + 4] = ee[3 - k] - eo[3 - k];
429                }
430                for(k = 0; k < 8; k++)
431                {
432                    pi2_tmp[k] =
433                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
434                    pi2_tmp[k + 8] =
435                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
436                }
437            }
438            pi2_src++;
439            pi2_tmp += trans_size;
440            zero_cols = zero_cols >> 1;
441        }
442
443        pi2_tmp = pi2_tmp_orig;
444
445        /* Inverse Transform 2nd stage */
446        shift = IT_SHIFT_STAGE_2;
447        add = 1 << (shift - 1);
448        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
449        {
450            for(j = 0; j < trans_size; j++)
451            {
452                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
453                for(k = 0; k < 8; k++)
454                {
455                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
456                                    + g_ai2_ihevc_trans_16[3][k]
457                                                    * pi2_tmp[3 * trans_size];
458                }
459                for(k = 0; k < 4; k++)
460                {
461                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
462                }
463                eeo[0] = 0;
464                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
465                eeo[1] = 0;
466                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
467
468                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
469                for(k = 0; k < 2; k++)
470                {
471                    ee[k] = eee[k] + eeo[k];
472                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
473                }
474                for(k = 0; k < 4; k++)
475                {
476                    e[k] = ee[k] + eo[k];
477                    e[k + 4] = ee[3 - k] - eo[3 - k];
478                }
479                for(k = 0; k < 8; k++)
480                {
481                    WORD32 itrans_out;
482                    itrans_out =
483                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
484                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
485                    itrans_out =
486                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
487                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
488                }
489                pi2_tmp++;
490                pu1_pred += pred_strd;
491                pu1_dst += dst_strd;
492            }
493        }
494        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
495        {
496            for(j = 0; j < trans_size; j++)
497            {
498                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
499                for(k = 0; k < 8; k++)
500                {
501                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
502                                    + g_ai2_ihevc_trans_16[3][k]
503                                                    * pi2_tmp[3 * trans_size]
504                                    + g_ai2_ihevc_trans_16[5][k]
505                                                    * pi2_tmp[5 * trans_size]
506                                    + g_ai2_ihevc_trans_16[7][k]
507                                                    * pi2_tmp[7 * trans_size];
508                }
509                for(k = 0; k < 4; k++)
510                {
511                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
512                                    + g_ai2_ihevc_trans_16[6][k]
513                                                    * pi2_tmp[6 * trans_size];
514                }
515                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
516                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
517                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
518                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
519
520                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
521                for(k = 0; k < 2; k++)
522                {
523                    ee[k] = eee[k] + eeo[k];
524                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
525                }
526                for(k = 0; k < 4; k++)
527                {
528                    e[k] = ee[k] + eo[k];
529                    e[k + 4] = ee[3 - k] - eo[3 - k];
530                }
531                for(k = 0; k < 8; k++)
532                {
533                    WORD32 itrans_out;
534                    itrans_out =
535                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
536                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
537                    itrans_out =
538                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
539                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
540                }
541                pi2_tmp++;
542                pu1_pred += pred_strd;
543                pu1_dst += dst_strd;
544            }
545        }
546        else /* All rows of output of 1st stage are non-zero */
547        {
548            for(j = 0; j < trans_size; j++)
549            {
550                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
551                for(k = 0; k < 8; k++)
552                {
553                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
554                                    + g_ai2_ihevc_trans_16[3][k]
555                                                    * pi2_tmp[3 * trans_size]
556                                    + g_ai2_ihevc_trans_16[5][k]
557                                                    * pi2_tmp[5 * trans_size]
558                                    + g_ai2_ihevc_trans_16[7][k]
559                                                    * pi2_tmp[7 * trans_size]
560                                    + g_ai2_ihevc_trans_16[9][k]
561                                                    * pi2_tmp[9 * trans_size]
562                                    + g_ai2_ihevc_trans_16[11][k]
563                                                    * pi2_tmp[11 * trans_size]
564                                    + g_ai2_ihevc_trans_16[13][k]
565                                                    * pi2_tmp[13 * trans_size]
566                                    + g_ai2_ihevc_trans_16[15][k]
567                                                    * pi2_tmp[15 * trans_size];
568                }
569                for(k = 0; k < 4; k++)
570                {
571                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
572                                    + g_ai2_ihevc_trans_16[6][k]
573                                                    * pi2_tmp[6 * trans_size]
574                                    + g_ai2_ihevc_trans_16[10][k]
575                                                    * pi2_tmp[10 * trans_size]
576                                    + g_ai2_ihevc_trans_16[14][k]
577                                                    * pi2_tmp[14 * trans_size];
578                }
579                eeo[0] =
580                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
581                                                + g_ai2_ihevc_trans_16[12][0]
582                                                                * pi2_tmp[12
583                                                                                * trans_size];
584                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
585                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
586                eeo[1] =
587                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
588                                                + g_ai2_ihevc_trans_16[12][1]
589                                                                * pi2_tmp[12
590                                                                                * trans_size];
591                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
592                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
593
594                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
595                for(k = 0; k < 2; k++)
596                {
597                    ee[k] = eee[k] + eeo[k];
598                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
599                }
600                for(k = 0; k < 4; k++)
601                {
602                    e[k] = ee[k] + eo[k];
603                    e[k + 4] = ee[3 - k] - eo[3 - k];
604                }
605                for(k = 0; k < 8; k++)
606                {
607                    WORD32 itrans_out;
608                    itrans_out =
609                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
610                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
611                    itrans_out =
612                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
613                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
614                }
615                pi2_tmp++;
616                pu1_pred += pred_strd;
617                pu1_dst += dst_strd;
618            }
619        }
620        /************************************************************************************************/
621        /************************************END - IT_RECON_16x16****************************************/
622        /************************************************************************************************/
623    }
624    else /* All rows of input are non-zero */
625    {
626        /************************************************************************************************/
627        /**********************************START - IT_RECON_16x16****************************************/
628        /************************************************************************************************/
629
630        /* Inverse Transform 1st stage */
631        shift = IT_SHIFT_STAGE_1;
632        add = 1 << (shift - 1);
633
634        for(j = 0; j < row_limit_2nd_stage; j++)
635        {
636            /* Checking for Zero Cols */
637            if((zero_cols & 1) == 1)
638            {
639                memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
640            }
641            else
642            {
643                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
644                for(k = 0; k < 8; k++)
645                {
646                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
647                                    + g_ai2_ihevc_trans_16[3][k]
648                                                    * pi2_src[3 * src_strd]
649                                    + g_ai2_ihevc_trans_16[5][k]
650                                                    * pi2_src[5 * src_strd]
651                                    + g_ai2_ihevc_trans_16[7][k]
652                                                    * pi2_src[7 * src_strd]
653                                    + g_ai2_ihevc_trans_16[9][k]
654                                                    * pi2_src[9 * src_strd]
655                                    + g_ai2_ihevc_trans_16[11][k]
656                                                    * pi2_src[11 * src_strd]
657                                    + g_ai2_ihevc_trans_16[13][k]
658                                                    * pi2_src[13 * src_strd]
659                                    + g_ai2_ihevc_trans_16[15][k]
660                                                    * pi2_src[15 * src_strd];
661                }
662                for(k = 0; k < 4; k++)
663                {
664                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
665                                    + g_ai2_ihevc_trans_16[6][k]
666                                                    * pi2_src[6 * src_strd]
667                                    + g_ai2_ihevc_trans_16[10][k]
668                                                    * pi2_src[10 * src_strd]
669                                    + g_ai2_ihevc_trans_16[14][k]
670                                                    * pi2_src[14 * src_strd];
671                }
672                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
673                                + g_ai2_ihevc_trans_16[12][0]
674                                                * pi2_src[12 * src_strd];
675                eee[0] =
676                                g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
677                                                + g_ai2_ihevc_trans_16[8][0]
678                                                                * pi2_src[8
679                                                                                * src_strd];
680                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
681                                + g_ai2_ihevc_trans_16[12][1]
682                                                * pi2_src[12 * src_strd];
683                eee[1] =
684                                g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
685                                                + g_ai2_ihevc_trans_16[8][1]
686                                                                * pi2_src[8
687                                                                                * src_strd];
688
689                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
690                for(k = 0; k < 2; k++)
691                {
692                    ee[k] = eee[k] + eeo[k];
693                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
694                }
695                for(k = 0; k < 4; k++)
696                {
697                    e[k] = ee[k] + eo[k];
698                    e[k + 4] = ee[3 - k] - eo[3 - k];
699                }
700                for(k = 0; k < 8; k++)
701                {
702                    pi2_tmp[k] =
703                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
704                    pi2_tmp[k + 8] =
705                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
706                }
707            }
708            pi2_src++;
709            pi2_tmp += trans_size;
710            zero_cols = zero_cols >> 1;
711        }
712
713        pi2_tmp = pi2_tmp_orig;
714
715        /* Inverse Transform 2nd stage */
716        shift = IT_SHIFT_STAGE_2;
717        add = 1 << (shift - 1);
718        if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
719        {
720            for(j = 0; j < trans_size; j++)
721            {
722                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
723                for(k = 0; k < 8; k++)
724                {
725                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
726                                    + g_ai2_ihevc_trans_16[3][k]
727                                                    * pi2_tmp[3 * trans_size];
728                }
729                for(k = 0; k < 4; k++)
730                {
731                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
732                }
733                eeo[0] = 0;
734                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
735                eeo[1] = 0;
736                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
737
738                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
739                for(k = 0; k < 2; k++)
740                {
741                    ee[k] = eee[k] + eeo[k];
742                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
743                }
744                for(k = 0; k < 4; k++)
745                {
746                    e[k] = ee[k] + eo[k];
747                    e[k + 4] = ee[3 - k] - eo[3 - k];
748                }
749                for(k = 0; k < 8; k++)
750                {
751                    WORD32 itrans_out;
752                    itrans_out =
753                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
754                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
755                    itrans_out =
756                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
757                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
758                }
759                pi2_tmp++;
760                pu1_pred += pred_strd;
761                pu1_dst += dst_strd;
762            }
763        }
764        else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 8 rows of output of 1st stage are non-zero */
765        {
766            for(j = 0; j < trans_size; j++)
767            {
768                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
769                for(k = 0; k < 8; k++)
770                {
771                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
772                                    + g_ai2_ihevc_trans_16[3][k]
773                                                    * pi2_tmp[3 * trans_size]
774                                    + g_ai2_ihevc_trans_16[5][k]
775                                                    * pi2_tmp[5 * trans_size]
776                                    + g_ai2_ihevc_trans_16[7][k]
777                                                    * pi2_tmp[7 * trans_size];
778                }
779                for(k = 0; k < 4; k++)
780                {
781                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
782                                    + g_ai2_ihevc_trans_16[6][k]
783                                                    * pi2_tmp[6 * trans_size];
784                }
785                eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
786                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
787                eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
788                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
789
790                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
791                for(k = 0; k < 2; k++)
792                {
793                    ee[k] = eee[k] + eeo[k];
794                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
795                }
796                for(k = 0; k < 4; k++)
797                {
798                    e[k] = ee[k] + eo[k];
799                    e[k + 4] = ee[3 - k] - eo[3 - k];
800                }
801                for(k = 0; k < 8; k++)
802                {
803                    WORD32 itrans_out;
804                    itrans_out =
805                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
806                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
807                    itrans_out =
808                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
809                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
810                }
811                pi2_tmp++;
812                pu1_pred += pred_strd;
813                pu1_dst += dst_strd;
814            }
815        }
816        else /* All rows of output of 1st stage are non-zero */
817        {
818            for(j = 0; j < trans_size; j++)
819            {
820                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
821                for(k = 0; k < 8; k++)
822                {
823                    o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
824                                    + g_ai2_ihevc_trans_16[3][k]
825                                                    * pi2_tmp[3 * trans_size]
826                                    + g_ai2_ihevc_trans_16[5][k]
827                                                    * pi2_tmp[5 * trans_size]
828                                    + g_ai2_ihevc_trans_16[7][k]
829                                                    * pi2_tmp[7 * trans_size]
830                                    + g_ai2_ihevc_trans_16[9][k]
831                                                    * pi2_tmp[9 * trans_size]
832                                    + g_ai2_ihevc_trans_16[11][k]
833                                                    * pi2_tmp[11 * trans_size]
834                                    + g_ai2_ihevc_trans_16[13][k]
835                                                    * pi2_tmp[13 * trans_size]
836                                    + g_ai2_ihevc_trans_16[15][k]
837                                                    * pi2_tmp[15 * trans_size];
838                }
839                for(k = 0; k < 4; k++)
840                {
841                    eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
842                                    + g_ai2_ihevc_trans_16[6][k]
843                                                    * pi2_tmp[6 * trans_size]
844                                    + g_ai2_ihevc_trans_16[10][k]
845                                                    * pi2_tmp[10 * trans_size]
846                                    + g_ai2_ihevc_trans_16[14][k]
847                                                    * pi2_tmp[14 * trans_size];
848                }
849                eeo[0] =
850                                g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
851                                                + g_ai2_ihevc_trans_16[12][0]
852                                                                * pi2_tmp[12
853                                                                                * trans_size];
854                eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
855                                + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
856                eeo[1] =
857                                g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
858                                                + g_ai2_ihevc_trans_16[12][1]
859                                                                * pi2_tmp[12
860                                                                                * trans_size];
861                eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
862                                + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
863
864                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
865                for(k = 0; k < 2; k++)
866                {
867                    ee[k] = eee[k] + eeo[k];
868                    ee[k + 2] = eee[1 - k] - eeo[1 - k];
869                }
870                for(k = 0; k < 4; k++)
871                {
872                    e[k] = ee[k] + eo[k];
873                    e[k + 4] = ee[3 - k] - eo[3 - k];
874                }
875                for(k = 0; k < 8; k++)
876                {
877                    WORD32 itrans_out;
878                    itrans_out =
879                                    CLIP_S16(((e[k] + o[k] + add) >> shift));
880                    pu1_dst[k * 2] = CLIP_U8((itrans_out + pu1_pred[k * 2]));
881                    itrans_out =
882                                    CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
883                    pu1_dst[(k + 8) * 2] = CLIP_U8((itrans_out + pu1_pred[(k + 8) * 2]));
884                }
885                pi2_tmp++;
886                pu1_pred += pred_strd;
887                pu1_dst += dst_strd;
888            }
889        }
890        /************************************************************************************************/
891        /************************************END - IT_RECON_16x16****************************************/
892        /************************************************************************************************/
893    }
894}
895
896