1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_itrans.c
22 *
23 * @brief
24 *  Contains function definitions for single stage  inverse transform
25 *
26 * @author
27 *  100470
28 *
29 * @par List of Functions:
30 *  - ihevc_itrans_4x4_ttype1()
31 *  - ihevc_itrans_4x4()
32 *  - ihevc_itrans_8x8()
33 *  - ihevc_itrans_16x16()
34 *  - ihevc_itrans_32x32()
35 *
36 * @remarks
37 *  None
38 *
39 *******************************************************************************
40 */
41#include <stdio.h>
42#include <string.h>
43#include "ihevc_typedefs.h"
44#include "ihevc_macros.h"
45#include "ihevc_platform_macros.h"
46#include "ihevc_defs.h"
47#include "ihevc_trans_tables.h"
48#include "ihevc_func_selector.h"
49#include "ihevc_trans_macros.h"
50
51#define NON_OPTIMIZED 1
52
53/**
54 *******************************************************************************
55 *
56 * @brief
57 *  This function performs Single stage  Inverse transform type 1 (DST) for
58 * 4x4 input block
59 *
60 * @par Description:
 *  Performs a single-stage 4x4 inverse transform of type 1, exploiting the
 * symmetry of the transformation matrix to reduce the number of
 * multiplications wherever possible while keeping the total number of
 * operations (additions, multiplications and shifts) the same
65 *
66 * @param[in] pi2_src
67 *  Input 4x4 coefficients
68 *
69 * @param[out] pi2_dst
70 *  Output 4x4 block
71 *
72 * @param[in] src_strd
73 *  Input stride
74 *
75 * @param[in] dst_strd
76 *  Output Stride
77 *
78 * @param[in] i4_shift
79 *  Output shift
80 *
81 * @param[in] zero_cols
82 *  Zero columns in pi2_src
83 *
84 * @returns  Void
85 *
86 * @remarks
87 *  None
88 *
89 *******************************************************************************
90 */
91
92
93void ihevc_itrans_4x4_ttype1(WORD16 *pi2_src,
94                             WORD16 *pi2_dst,
95                             WORD32 src_strd,
96                             WORD32 dst_strd,
97                             WORD32 i4_shift,
98                             WORD32 zero_cols)
99{
100    WORD32 i, c[4];
101    WORD32 add;
102
103    add = 1 << (i4_shift - 1);
104
105    for(i = 0; i < TRANS_SIZE_4; i++)
106    {
107        /* Checking for Zero Cols */
108        if((zero_cols & 1) == 1)
109        {
110            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
111        }
112        else
113        {
114            // Intermediate Variables
115            c[0] = pi2_src[0] + pi2_src[2 * src_strd];
116            c[1] = pi2_src[2 * src_strd] + pi2_src[3 * src_strd];
117            c[2] = pi2_src[0] - pi2_src[3 * src_strd];
118            c[3] = 74 * pi2_src[src_strd];
119
120            pi2_dst[0] =
121                            CLIP_S16((29 * c[0] + 55 * c[1] + c[3] + add) >> i4_shift);
122            pi2_dst[1] =
123                            CLIP_S16((55 * c[2] - 29 * c[1] + c[3] + add) >> i4_shift);
124            pi2_dst[2] =
125                            CLIP_S16((74 * (pi2_src[0] - pi2_src[2 * src_strd] + pi2_src[3 * src_strd]) + add) >> i4_shift);
126            pi2_dst[3] =
127                            CLIP_S16((55 * c[0] + 29 * c[2] - c[3] + add) >> i4_shift);
128        }
129        pi2_src++;
130        pi2_dst += dst_strd;
131        zero_cols = zero_cols >> 1;
132    }
133}
134
135
136/**
137 *******************************************************************************
138 *
139 * @brief
140 *  This function performs Single stage  Inverse transform for 4x4 input
141 * block
142 *
143 * @par Description:
 *  Performs a single-stage 4x4 inverse transform, exploiting the symmetry of
 * the transformation matrix to reduce the number of multiplications wherever
 * possible while keeping the total number of operations (additions,
 * multiplications and shifts) the same
148 *
149 * @param[in] pi2_src
150 *  Input 4x4 coefficients
151 *
152 * @param[out] pi2_dst
153 *  Output 4x4 block
154 *
155 * @param[in] src_strd
156 *  Input stride
157 *
158 * @param[in] dst_strd
159 *  Output Stride
160 *
161 * @param[in] i4_shift
162 *  Output shift
163 *
164 * @param[in] zero_cols
165 *  Zero columns in pi2_src
166 *
167 * @returns  Void
168 *
169 * @remarks
170 *  None
171 *
172 *******************************************************************************
173 */
174
175#if NON_OPTIMIZED
176void ihevc_itrans_4x4(WORD16 *pi2_src,
177                      WORD16 *pi2_dst,
178                      WORD32 src_strd,
179                      WORD32 dst_strd,
180                      WORD32 i4_shift,
181                      WORD32 zero_cols)
182{
183    WORD32 j;
184    WORD32 e[2], o[2];
185    WORD32 add;
186
187    add = 1 << (i4_shift - 1);
188
189    for(j = 0; j < TRANS_SIZE_4; j++)
190    {
191        /* Checking for Zero Cols */
192        if((zero_cols & 1) == 1)
193        {
194            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
195        }
196        else
197        {
198
199            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
200            o[0] = g_ai2_ihevc_trans_4[1][0] * pi2_src[src_strd]
201                            + g_ai2_ihevc_trans_4[3][0] * pi2_src[3 * src_strd];
202            o[1] = g_ai2_ihevc_trans_4[1][1] * pi2_src[src_strd]
203                            + g_ai2_ihevc_trans_4[3][1] * pi2_src[3 * src_strd];
204            e[0] = g_ai2_ihevc_trans_4[0][0] * pi2_src[0]
205                            + g_ai2_ihevc_trans_4[2][0] * pi2_src[2 * src_strd];
206            e[1] = g_ai2_ihevc_trans_4[0][1] * pi2_src[0]
207                            + g_ai2_ihevc_trans_4[2][1] * pi2_src[2 * src_strd];
208
209            pi2_dst[0] =
210                            CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
211            pi2_dst[1] =
212                            CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
213            pi2_dst[2] =
214                            CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
215            pi2_dst[3] =
216                            CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
217
218        }
219        pi2_src++;
220        pi2_dst += dst_strd;
221        zero_cols = zero_cols >> 1;
222    }
223}
224#else
225void ihevc_itrans_4x4(WORD16 *pi2_src,
226                      WORD16 *pi2_dst,
227                      WORD32 src_strd,
228                      WORD32 dst_strd,
229                      WORD32 i4_shift,
230                      WORD32 zero_cols)
231{
232    WORD32 j;
233    WORD32 e[2], o[2];
234    WORD32 add;
235
236    add = 1 << (i4_shift - 1);
237
238    /***************************************************************************/
239    /* Transform Matrix 4x4                                                    */
240    /*      0   1   2   3                                                      */
241    /* 0 { 64, 64, 64, 64},                                                    */
242    /* 1 { 83, 36,-36,-83},                                                    */
243    /* 2 { 64,-64,-64, 64},                                                    */
244    /* 3 { 36,-83, 83,-36}                                                     */
245    /***************************************************************************/
246
247    for(j = 0; j < TRANS_SIZE_4; j++)
248    {
249        WORD32 temp;
250
251        /* Checking for Zero Cols */
252        if((zero_cols & 1) == 1)
253        {
254            memset(pi2_dst, 0, TRANS_SIZE_4 * sizeof(WORD16));
255        }
256        else
257        {
258            /* Common operation in o[0] and o[1] */
259            temp = (pi2_src[src_strd] + pi2_src[3 * src_strd]) * 36;
260
261            o[0] = temp + 47 * pi2_src[src_strd];
262            o[1] = temp - 119 * pi2_src[3 * src_strd];
263            e[0] = (pi2_src[0] + pi2_src[2 * src_strd]) << 6;
264            e[1] = (pi2_src[0] - pi2_src[2 * src_strd]) << 6;
265
266            pi2_dst[0] =
267                            CLIP_S16(((e[0] + o[0] + add) >> i4_shift));
268            pi2_dst[1] =
269                            CLIP_S16(((e[1] + o[1] + add) >> i4_shift));
270            pi2_dst[2] =
271                            CLIP_S16(((e[1] - o[1] + add) >> i4_shift));
272            pi2_dst[3] =
273                            CLIP_S16(((e[0] - o[0] + add) >> i4_shift));
274        }
275        pi2_src++;
276        pi2_dst += dst_strd;
277        zero_cols = zero_cols >> 1;
278    }
279}
280#endif
281
282/**
283 *******************************************************************************
284 *
285 * @brief
286 *  This function performs Single stage  Inverse transform for 8x8 input
287 * block
288 *
289 * @par Description:
 *  Performs a single-stage 8x8 inverse transform, exploiting the symmetry of
 * the transformation matrix to reduce the number of multiplications wherever
 * possible while keeping the total number of operations (additions,
 * multiplications and shifts) the same
294 *
295 * @param[in] pi2_src
296 *  Input 8x8 coefficients
297 *
298 * @param[out] pi2_dst
299 *  Output 8x8 block
300 *
301 * @param[in] src_strd
302 *  Input stride
303 *
304 * @param[in] dst_strd
305 *  Output Stride
306 *
307 * @param[in] i4_shift
308 *  Output shift
309 *
310 * @param[in] zero_cols
311 *  Zero columns in pi2_src
312 *
313 * @returns  Void
314 *
315 * @remarks
316 *  None
317 *
318 *******************************************************************************
319 */
320
321#if NON_OPTIMIZED
322void ihevc_itrans_8x8(WORD16 *pi2_src,
323                      WORD16 *pi2_dst,
324                      WORD32 src_strd,
325                      WORD32 dst_strd,
326                      WORD32 i4_shift,
327                      WORD32 zero_cols)
328{
329    WORD32 j, k;
330    WORD32 e[4], o[4];
331    WORD32 ee[2], eo[2];
332    WORD32 add;
333
334    add = 1 << (i4_shift - 1);
335
336    for(j = 0; j < TRANS_SIZE_8; j++)
337    {
338        /* Checking for Zero Cols */
339        if((zero_cols & 1) == 1)
340        {
341            memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
342        }
343        else
344        {
345            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
346            for(k = 0; k < 4; k++)
347            {
348                o[k] = g_ai2_ihevc_trans_8[1][k] * pi2_src[src_strd]
349                                + g_ai2_ihevc_trans_8[3][k]
350                                                * pi2_src[3 * src_strd]
351                                + g_ai2_ihevc_trans_8[5][k]
352                                                * pi2_src[5 * src_strd]
353                                + g_ai2_ihevc_trans_8[7][k]
354                                                * pi2_src[7 * src_strd];
355            }
356
357            eo[0] = g_ai2_ihevc_trans_8[2][0] * pi2_src[2 * src_strd]
358                            + g_ai2_ihevc_trans_8[6][0] * pi2_src[6 * src_strd];
359            eo[1] = g_ai2_ihevc_trans_8[2][1] * pi2_src[2 * src_strd]
360                            + g_ai2_ihevc_trans_8[6][1] * pi2_src[6 * src_strd];
361            ee[0] = g_ai2_ihevc_trans_8[0][0] * pi2_src[0]
362                            + g_ai2_ihevc_trans_8[4][0] * pi2_src[4 * src_strd];
363            ee[1] = g_ai2_ihevc_trans_8[0][1] * pi2_src[0]
364                            + g_ai2_ihevc_trans_8[4][1] * pi2_src[4 * src_strd];
365
366            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
367            e[0] = ee[0] + eo[0];
368            e[3] = ee[0] - eo[0];
369            e[1] = ee[1] + eo[1];
370            e[2] = ee[1] - eo[1];
371            for(k = 0; k < 4; k++)
372            {
373                pi2_dst[k] =
374                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
375                pi2_dst[k + 4] =
376                                CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
377            }
378        }
379        pi2_src++;
380        pi2_dst += dst_strd;
381        zero_cols = zero_cols >> 1;
382    }
383}
384
385#else
386void ihevc_itrans_8x8(WORD16 *pi2_src,
387                      WORD16 *pi2_dst,
388                      WORD32 src_strd,
389                      WORD32 dst_strd,
390                      WORD32 i4_shift,
391                      WORD32 zero_cols)
392{
393    /* Transform Matrix 8x8                          */
394    /*              0    1    2   3   4   5   6   7  */
395    /*     0 -      64   64   64  64  64  64  64  64 */
396    /*     1 -      89   75   50  18 -18 -50 -75 -89 */
397    /*     2 -      83   36  -36 -83 -83 -36  36  83 */
398    /*     3 -      75  -18  -89 -50  50  89  18 -75 */
399    /*     4 -      64  -64  -64  64  64 -64 -64  64 */
400    /*     5 -      50  -89   18  75 -75 -18  89 -50 */
401    /*     6 -      36  -83   83 -36 -36  83 -83  36 */
402    /*     7 -      18  -50   75 -89  89 -75  50 -18 */
403
404    /* 0th and 4th row will have no multiplications */
405    /* 2nd and 6th row has only two coefff multiplies */
406    /* 1st, 3rd, 5th and 7th rows have o mirror symmetry */
407    WORD32 j, k;
408    WORD32 temp1, temp2;
409    WORD32 e[4], o[4];
410    WORD32 ee[2], eo[2];
411    WORD32 add;
412
413    add = 1 << (i4_shift - 1);
414
415    for(j = 0; j < TRANS_SIZE_8; j++)
416    {
417        /* Checking for Zero Cols */
418        if((zero_cols & 1) == 1)
419        {
420            memset(pi2_dst, 0, TRANS_SIZE_8 * sizeof(WORD16));
421        }
422        else
423        {
424
425            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
426            /*
427             o[0] = 89 *pi2_src[8] +  75 *pi2_src[3*8] +  50 *pi2_src[5*8] +  18 *pi2_src[7*8];
428             o[1] = 75 *pi2_src[8] + -18 *pi2_src[3*8] + -89 *pi2_src[5*8] + -50 *pi2_src[7*8];
429             o[2] = 50 *pi2_src[8] + -89 *pi2_src[3*8] +  18 *pi2_src[5*8] +  75 *pi2_src[7*8];
430             o[3] = 18 *pi2_src[8] + -50 *pi2_src[3*8] +  75 *pi2_src[5*8] + -89 *pi2_src[7*8];
431             */
432
433            /* Optimization: 4 mul + 2 add  ---> 3 mul + 3 add */
434            /*
435             temp1 = (pi2_src[8  ] + pi2_src[3*8]) * 75;
436             temp2 = (pi2_src[5*8] + pi2_src[7*8]) * 50;
437
438             o[0] = temp1 + 14 * pi2_src[8  ] + temp2 - 32 * pi2_src[7*8];
439             o[1] = temp1 - 93 * pi2_src[3*8] - temp2 - 39 * pi2_src[5*8];
440             */
441
442            temp1 = (pi2_src[src_strd] + pi2_src[3 * src_strd]) * 75;
443            temp2 = (pi2_src[5 * src_strd] + pi2_src[7 * src_strd]) * 50;
444
445            o[0] = temp1 + 14 * pi2_src[src_strd] + temp2
446                            - (pi2_src[7 * src_strd] << 5);
447            o[1] = temp1 - 93 * pi2_src[3 * src_strd] - temp2
448                            - 39 * pi2_src[5 * src_strd];
449
450            /* Optimization: 4 mul + 2 add  ---> 3 mul + 3 add */
451            /*
452             temp1 = (pi2_src[8  ] - pi2_src[3*8]) * 50;
453             temp2 = (pi2_src[5*8] + pi2_src[7*8]) * 75;
454
455             o[2] = temp1 - 39 * pi2_src[3*8] + temp2 -  57 * pi2_src[5*8];
456             o[3] = temp1 - 32 * pi2_src[8  ] + temp2 - 164 * pi2_src[7*8];
457             */
458
459            temp1 = (pi2_src[src_strd] - pi2_src[3 * src_strd]) * 50;
460            temp2 = (pi2_src[5 * src_strd] + pi2_src[7 * src_strd]) * 75;
461
462            o[2] = temp1 - 39 * pi2_src[3 * src_strd] + temp2
463                            - 57 * pi2_src[5 * src_strd];
464            o[3] = temp1 - (pi2_src[src_strd] << 5) + temp2
465                            - 164 * pi2_src[7 * src_strd];
466
467            /*
468             eo[0] = 83 *pi2_src[ 2*8 ] +  36 *pi2_src[ 6*8 ];
469             eo[1] = 36 *pi2_src[ 2*8 ] + -83 *pi2_src[ 6*8 ];
470             ee[0] = 64 *pi2_src[ 0   ] +  64 *pi2_src[ 4*8 ];
471             ee[1] = 64 *pi2_src[ 0   ] + -64 *pi2_src[ 4*8 ];
472             */
473
474            /* Optimization: 4 mul + 2 add  ---> 3 mul + 3 add */
475            temp1 = (pi2_src[2 * src_strd] + pi2_src[6 * src_strd]) * 36;
476            eo[0] = temp1 + 47 * pi2_src[2 * src_strd];
477            eo[1] = temp1 - 119 * pi2_src[6 * src_strd];
478
479            /* Optimization: 4 mul + 2 add  ---> 2 i4_shift + 2 add */
480            ee[0] = (pi2_src[0] + pi2_src[4 * src_strd]) << 6;
481            ee[1] = (pi2_src[0] - pi2_src[4 * src_strd]) << 6;
482
483            e[0] = ee[0] + eo[0];
484            e[3] = ee[0] - eo[0];
485            e[1] = ee[1] + eo[1];
486            e[2] = ee[1] - eo[1];
487
488            for(k = 0; k < 4; k++)
489            {
490                pi2_dst[k] =
491                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
492                pi2_dst[k + 4] =
493                                CLIP_S16(((e[3 - k] - o[3 - k] + add) >> i4_shift));
494            }
495        }
496        pi2_src++;
497        pi2_dst += dst_strd;
498        zero_cols = zero_cols >> 1;
499    }
500
501}
502#endif
503
504
505/**
506 *******************************************************************************
507 *
508 * @brief
509 *  This function performs Single stage  Inverse transform for 16x16 input
510 * block
511 *
512 * @par Description:
 *  Performs a single-stage 16x16 inverse transform, exploiting the symmetry
 * of the transformation matrix to reduce the number of multiplications
 * wherever possible while keeping the total number of operations
 * (additions, multiplications and shifts) the same
517 *
518 * @param[in] pi2_src
519 *  Input 16x16 coefficients
520 *
521 * @param[out] pi2_dst
522 *  Output 16x16 block
523 *
524 * @param[in] src_strd
525 *  Input stride
526 *
527 * @param[in] dst_strd
528 *  Output Stride
529 *
530 * @param[in] i4_shift
531 *  Output shift
532 *
533 * @param[in] zero_cols
534 *  Zero columns in pi2_src
535 *
536 * @returns  Void
537 *
538 * @remarks
539 *  None
540 *
541 *******************************************************************************
542 */
543
544#if NON_OPTIMIZED
545void ihevc_itrans_16x16(WORD16 *pi2_src,
546                        WORD16 *pi2_dst,
547                        WORD32 src_strd,
548                        WORD32 dst_strd,
549                        WORD32 i4_shift,
550                        WORD32 zero_cols)
551{
552    WORD32 j, k;
553    WORD32 e[8], o[8];
554    WORD32 ee[4], eo[4];
555    WORD32 eee[2], eeo[2];
556    WORD32 add;
557
558    add = 1 << (i4_shift - 1);
559
560    for(j = 0; j < TRANS_SIZE_16; j++)
561    {
562        /* Checking for Zero Cols */
563        if((zero_cols & 1) == 1)
564        {
565            memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
566        }
567        else
568        {
569            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
570            for(k = 0; k < 8; k++)
571            {
572                o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
573                                + g_ai2_ihevc_trans_16[3][k]
574                                                * pi2_src[3 * src_strd]
575                                + g_ai2_ihevc_trans_16[5][k]
576                                                * pi2_src[5 * src_strd]
577                                + g_ai2_ihevc_trans_16[7][k]
578                                                * pi2_src[7 * src_strd]
579                                + g_ai2_ihevc_trans_16[9][k]
580                                                * pi2_src[9 * src_strd]
581                                + g_ai2_ihevc_trans_16[11][k]
582                                                * pi2_src[11 * src_strd]
583                                + g_ai2_ihevc_trans_16[13][k]
584                                                * pi2_src[13 * src_strd]
585                                + g_ai2_ihevc_trans_16[15][k]
586                                                * pi2_src[15 * src_strd];
587            }
588            for(k = 0; k < 4; k++)
589            {
590                eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
591                                + g_ai2_ihevc_trans_16[6][k]
592                                                * pi2_src[6 * src_strd]
593                                + g_ai2_ihevc_trans_16[10][k]
594                                                * pi2_src[10 * src_strd]
595                                + g_ai2_ihevc_trans_16[14][k]
596                                                * pi2_src[14 * src_strd];
597            }
598            eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
599                            + g_ai2_ihevc_trans_16[12][0]
600                                            * pi2_src[12 * src_strd];
601            eee[0] =
602                            g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
603                                            + g_ai2_ihevc_trans_16[8][0]
604                                                            * pi2_src[8
605                                                                            * src_strd];
606            eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
607                            + g_ai2_ihevc_trans_16[12][1]
608                                            * pi2_src[12 * src_strd];
609            eee[1] =
610                            g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
611                                            + g_ai2_ihevc_trans_16[8][1]
612                                                            * pi2_src[8
613                                                                            * src_strd];
614
615            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
616            for(k = 0; k < 2; k++)
617            {
618                ee[k] = eee[k] + eeo[k];
619                ee[k + 2] = eee[1 - k] - eeo[1 - k];
620            }
621            for(k = 0; k < 4; k++)
622            {
623                e[k] = ee[k] + eo[k];
624                e[k + 4] = ee[3 - k] - eo[3 - k];
625            }
626            for(k = 0; k < 8; k++)
627            {
628                pi2_dst[k] =
629                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
630                pi2_dst[k + 8] =
631                                CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
632            }
633        }
634        pi2_src++;
635        pi2_dst += dst_strd;
636        zero_cols = zero_cols >> 1;
637    }
638}
639#else
640void ihevc_itrans_16x16(WORD16 *pi2_src,
641                        WORD16 *pi2_dst,
642                        WORD32 src_strd,
643                        WORD32 dst_strd,
644                        WORD32 i4_shift,
645                        WORD32 zero_cols)
646{
647    WORD32 j, k;
648    WORD32 e[8], o[8];
649    WORD32 ee[4], eo[4];
650    WORD32 eee[2], eeo[2];
651    WORD32 add;
652    WORD32 temp1, temp2;
653
654    add = 1 << (i4_shift - 1);
655    /***************************************************************************/
656    /* Transform Matrix 16x16                                                  */
657    /*       0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15     */
658    /* 0  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},   */
659    /* 1  { 90, 87, 80, 70, 57, 43, 25,  9, -9,-25,-43,-57,-70,-80,-87,-90},   */
660    /* 2  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},   */
661    /* 3  { 87, 57,  9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87},   */
662    /* 4  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},   */
663    /* 5  { 80,  9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80},   */
664    /* 6  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},   */
665    /* 7  { 70,-43,-87,  9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70},   */
666    /* 8  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},   */
667    /* 9  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87,  9,-90, 25, 80,-57},   */
668    /* 10 { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},   */
669    /* 11 { 43,-90, 57, 25,-87, 70,  9,-80, 80, -9,-70, 87,-25,-57, 90,-43},   */
670    /* 12 { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},   */
671    /* 13 { 25,-70, 90,-80, 43,  9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25},   */
672    /* 14 { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},   */
673    /* 15 {  9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9}    */
674    /***************************************************************************/
675
676    for(j = 0; j < TRANS_SIZE_16; j++)
677    {
678        /* Checking for Zero Cols */
679        if((zero_cols & 1) == 1)
680        {
681            memset(pi2_dst, 0, TRANS_SIZE_16 * sizeof(WORD16));
682        }
683        else
684        {
685            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
686            {
687                /*
688                 o[k] = g_ai2_ihevc_trans_16[ 1][k]*pi2_src[ src_strd   ] + g_ai2_ihevc_trans_16[ 3][k]*pi2_src[ 3*src_strd   ] + g_ai2_ihevc_trans_16[ 5][k]*pi2_src[ 5*src_strd   ] + g_ai2_ihevc_trans_16[ 7][k]*pi2_src[ 7*src_strd   ] +
689                 g_ai2_ihevc_trans_16[ 9][k]*pi2_src[ 9*src_strd   ] + g_ai2_ihevc_trans_16[11][k]*pi2_src[11*src_strd   ] + g_ai2_ihevc_trans_16[13][k]*pi2_src[13*src_strd   ] + g_ai2_ihevc_trans_16[15][k]*pi2_src[15*src_strd   ];
690                 */
691
692                o[0] = 90 * pi2_src[src_strd] + 87 * pi2_src[3 * src_strd]
693                                + 80 * pi2_src[5 * src_strd]
694                                + 70 * pi2_src[7 * src_strd]
695                                + 57 * pi2_src[9 * src_strd]
696                                + 43 * pi2_src[11 * src_strd]
697                                + 25 * pi2_src[13 * src_strd]
698                                + 9 * pi2_src[15 * src_strd];
699
700                o[1] = 87 * pi2_src[src_strd] + 57 * pi2_src[3 * src_strd]
701                                + 9 * pi2_src[5 * src_strd]
702                                + -43 * pi2_src[7 * src_strd]
703                                + -80 * pi2_src[9 * src_strd]
704                                + -90 * pi2_src[11 * src_strd]
705                                + -70 * pi2_src[13 * src_strd]
706                                + -25 * pi2_src[15 * src_strd];
707
708                o[2] = 80 * pi2_src[src_strd] + 9 * pi2_src[3 * src_strd]
709                                + -70 * pi2_src[5 * src_strd]
710                                + -87 * pi2_src[7 * src_strd]
711                                + -25 * pi2_src[9 * src_strd]
712                                + 57 * pi2_src[11 * src_strd]
713                                + 90 * pi2_src[13 * src_strd]
714                                + 43 * pi2_src[15 * src_strd];
715
716                o[3] = 70 * pi2_src[src_strd] + -43 * pi2_src[3 * src_strd]
717                                + -87 * pi2_src[5 * src_strd]
718                                + 9 * pi2_src[7 * src_strd]
719                                + 90 * pi2_src[9 * src_strd]
720                                + 25 * pi2_src[11 * src_strd]
721                                + -80 * pi2_src[13 * src_strd]
722                                + -57 * pi2_src[15 * src_strd];
723
724                o[4] = 57 * pi2_src[src_strd] + -80 * pi2_src[3 * src_strd]
725                                + -25 * pi2_src[5 * src_strd]
726                                + 90 * pi2_src[7 * src_strd]
727                                + -9 * pi2_src[9 * src_strd]
728                                + -87 * pi2_src[11 * src_strd]
729                                + 43 * pi2_src[13 * src_strd]
730                                + 70 * pi2_src[15 * src_strd];
731
732                o[5] = 43 * pi2_src[src_strd] + -90 * pi2_src[3 * src_strd]
733                                + 57 * pi2_src[5 * src_strd]
734                                + 25 * pi2_src[7 * src_strd]
735                                + -87 * pi2_src[9 * src_strd]
736                                + 70 * pi2_src[11 * src_strd]
737                                + 9 * pi2_src[13 * src_strd]
738                                + -80 * pi2_src[15 * src_strd];
739
740                o[6] = 25 * pi2_src[src_strd] + -70 * pi2_src[3 * src_strd]
741                                + 90 * pi2_src[5 * src_strd]
742                                + -80 * pi2_src[7 * src_strd]
743                                + 43 * pi2_src[9 * src_strd]
744                                + 9 * pi2_src[11 * src_strd]
745                                + -57 * pi2_src[13 * src_strd]
746                                + 87 * pi2_src[15 * src_strd];
747
748                o[7] = 9 * pi2_src[src_strd] + -25 * pi2_src[3 * src_strd]
749                                + 43 * pi2_src[5 * src_strd]
750                                + -57 * pi2_src[7 * src_strd]
751                                + 70 * pi2_src[9 * src_strd]
752                                + -80 * pi2_src[11 * src_strd]
753                                + 87 * pi2_src[13 * src_strd]
754                                + -90 * pi2_src[15 * src_strd];
755            }
756            {
757                temp1 = (pi2_src[2 * src_strd] + pi2_src[6 * src_strd]) * 75;
758                temp2 = (pi2_src[10 * src_strd] + pi2_src[14 * src_strd]) * 50;
759                eo[0] = temp1 + 14 * pi2_src[2 * src_strd] + temp2
760                                - (pi2_src[14 * src_strd] << 5);
761                eo[1] = temp1 - 93 * pi2_src[6 * src_strd] - temp2
762                                - 39 * pi2_src[10 * src_strd];
763
764                temp1 = (pi2_src[2 * src_strd] - pi2_src[6 * src_strd]) * 50;
765                temp2 = (pi2_src[10 * src_strd] + pi2_src[14 * src_strd]) * 75;
766                eo[2] = temp1 - 39 * pi2_src[6 * src_strd] + temp2
767                                - 57 * pi2_src[10 * src_strd];
768                eo[3] = temp1 - (pi2_src[2 * src_strd] << 5) + temp2
769                                - 164 * pi2_src[14 * src_strd];
770            }
771
772            temp1 = (pi2_src[4 * src_strd] + pi2_src[12 * src_strd]) * 36;
773            eeo[0] = temp1 + 47 * pi2_src[4 * src_strd];
774            eeo[1] = temp1 - 119 * pi2_src[12 * src_strd];
775
776            eee[0] = (pi2_src[0] + pi2_src[8 * src_strd]) << 6;
777            eee[1] = (pi2_src[0] - pi2_src[8 * src_strd]) << 6;
778
779            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
780            for(k = 0; k < 2; k++)
781            {
782                ee[k] = eee[k] + eeo[k];
783                ee[k + 2] = eee[1 - k] - eeo[1 - k];
784            }
785            for(k = 0; k < 4; k++)
786            {
787                e[k] = ee[k] + eo[k];
788                e[k + 4] = ee[3 - k] - eo[3 - k];
789            }
790            for(k = 0; k < 8; k++)
791            {
792                pi2_dst[k] =
793                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
794                pi2_dst[k + 8] =
795                                CLIP_S16(((e[7 - k] - o[7 - k] + add) >> i4_shift));
796            }
797        }
798        pi2_src++;
799        pi2_dst += dst_strd;
800        zero_cols = zero_cols >> 1;
801    }
802}
803#endif
804
805/**
806 *******************************************************************************
807 *
808 * @brief
809 *  This function performs Single stage  Inverse transform for 32x32 input
810 * block
811 *
 * @par Description:
 *  Performs a single stage 32x32 inverse transform. The symmetry of the
 * transformation matrix is used to reduce the number of multiplications
 * wherever possible, while keeping the number of operations (addition,
 * multiplication and shift) the same
817 *
818 * @param[in] pi2_src
819 *  Input 32x32 coefficients
820 *
821 * @param[out] pi2_dst
822 *  Output 32x32 block
823 *
824 * @param[in] src_strd
825 *  Input stride
826 *
827 * @param[in] dst_strd
828 *  Output Stride
829 *
830 * @param[in] i4_shift
831 *  Output shift
832 *
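 * @param[in] zero_cols
 *  Bit mask of zero columns in pi2_src, least significant bit first: when
 *  bit j is set, column j is not read and the corresponding output row is
 *  filled with zeros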
835 *
836 * @returns  Void
837 *
838 * @remarks
839 *  None
840 *
841 *******************************************************************************
842 */
843
844
845void ihevc_itrans_32x32(WORD16 *pi2_src,
846                        WORD16 *pi2_dst,
847                        WORD32 src_strd,
848                        WORD32 dst_strd,
849                        WORD32 i4_shift,
850                        WORD32 zero_cols)
851{
852    WORD32 j, k;
853    WORD32 e[16], o[16];
854    WORD32 ee[8], eo[8];
855    WORD32 eee[4], eeo[4];
856    WORD32 eeee[2], eeeo[2];
857    WORD32 add;
858
859    add = 1 << (i4_shift - 1);
860
861    for(j = 0; j < TRANS_SIZE_32; j++)
862    {
863        /* Checking for Zero Cols */
864        if((zero_cols & 1) == 1)
865        {
866            memset(pi2_dst, 0, TRANS_SIZE_32 * sizeof(WORD16));
867        }
868        else
869        {
870            /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
871            for(k = 0; k < 16; k++)
872            {
873                o[k] = g_ai2_ihevc_trans_32[1][k] * pi2_src[src_strd]
874                                + g_ai2_ihevc_trans_32[3][k]
875                                                * pi2_src[3 * src_strd]
876                                + g_ai2_ihevc_trans_32[5][k]
877                                                * pi2_src[5 * src_strd]
878                                + g_ai2_ihevc_trans_32[7][k]
879                                                * pi2_src[7 * src_strd]
880                                + g_ai2_ihevc_trans_32[9][k]
881                                                * pi2_src[9 * src_strd]
882                                + g_ai2_ihevc_trans_32[11][k]
883                                                * pi2_src[11 * src_strd]
884                                + g_ai2_ihevc_trans_32[13][k]
885                                                * pi2_src[13 * src_strd]
886                                + g_ai2_ihevc_trans_32[15][k]
887                                                * pi2_src[15 * src_strd]
888                                + g_ai2_ihevc_trans_32[17][k]
889                                                * pi2_src[17 * src_strd]
890                                + g_ai2_ihevc_trans_32[19][k]
891                                                * pi2_src[19 * src_strd]
892                                + g_ai2_ihevc_trans_32[21][k]
893                                                * pi2_src[21 * src_strd]
894                                + g_ai2_ihevc_trans_32[23][k]
895                                                * pi2_src[23 * src_strd]
896                                + g_ai2_ihevc_trans_32[25][k]
897                                                * pi2_src[25 * src_strd]
898                                + g_ai2_ihevc_trans_32[27][k]
899                                                * pi2_src[27 * src_strd]
900                                + g_ai2_ihevc_trans_32[29][k]
901                                                * pi2_src[29 * src_strd]
902                                + g_ai2_ihevc_trans_32[31][k]
903                                                * pi2_src[31 * src_strd];
904            }
905            for(k = 0; k < 8; k++)
906            {
907                eo[k] = g_ai2_ihevc_trans_32[2][k] * pi2_src[2 * src_strd]
908                                + g_ai2_ihevc_trans_32[6][k]
909                                                * pi2_src[6 * src_strd]
910                                + g_ai2_ihevc_trans_32[10][k]
911                                                * pi2_src[10 * src_strd]
912                                + g_ai2_ihevc_trans_32[14][k]
913                                                * pi2_src[14 * src_strd]
914                                + g_ai2_ihevc_trans_32[18][k]
915                                                * pi2_src[18 * src_strd]
916                                + g_ai2_ihevc_trans_32[22][k]
917                                                * pi2_src[22 * src_strd]
918                                + g_ai2_ihevc_trans_32[26][k]
919                                                * pi2_src[26 * src_strd]
920                                + g_ai2_ihevc_trans_32[30][k]
921                                                * pi2_src[30 * src_strd];
922            }
923            for(k = 0; k < 4; k++)
924            {
925                eeo[k] = g_ai2_ihevc_trans_32[4][k] * pi2_src[4 * src_strd]
926                                + g_ai2_ihevc_trans_32[12][k]
927                                                * pi2_src[12 * src_strd]
928                                + g_ai2_ihevc_trans_32[20][k]
929                                                * pi2_src[20 * src_strd]
930                                + g_ai2_ihevc_trans_32[28][k]
931                                                * pi2_src[28 * src_strd];
932            }
933            eeeo[0] = g_ai2_ihevc_trans_32[8][0] * pi2_src[8 * src_strd]
934                            + g_ai2_ihevc_trans_32[24][0]
935                                            * pi2_src[24 * src_strd];
936            eeeo[1] = g_ai2_ihevc_trans_32[8][1] * pi2_src[8 * src_strd]
937                            + g_ai2_ihevc_trans_32[24][1]
938                                            * pi2_src[24 * src_strd];
939            eeee[0] = g_ai2_ihevc_trans_32[0][0] * pi2_src[0]
940                            + g_ai2_ihevc_trans_32[16][0]
941                                            * pi2_src[16 * src_strd];
942            eeee[1] = g_ai2_ihevc_trans_32[0][1] * pi2_src[0]
943                            + g_ai2_ihevc_trans_32[16][1]
944                                            * pi2_src[16 * src_strd];
945
946            /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
947            eee[0] = eeee[0] + eeeo[0];
948            eee[3] = eeee[0] - eeeo[0];
949            eee[1] = eeee[1] + eeeo[1];
950            eee[2] = eeee[1] - eeeo[1];
951            for(k = 0; k < 4; k++)
952            {
953                ee[k] = eee[k] + eeo[k];
954                ee[k + 4] = eee[3 - k] - eeo[3 - k];
955            }
956            for(k = 0; k < 8; k++)
957            {
958                e[k] = ee[k] + eo[k];
959                e[k + 8] = ee[7 - k] - eo[7 - k];
960            }
961            for(k = 0; k < 16; k++)
962            {
963                pi2_dst[k] =
964                                CLIP_S16(((e[k] + o[k] + add) >> i4_shift));
965                pi2_dst[k + 16] =
966                                CLIP_S16(((e[15 - k] - o[15 - k] + add) >> i4_shift));
967            }
968        }
969        pi2_src++;
970        pi2_dst += dst_strd;
971        zero_cols = zero_cols >> 1;
972    }
973}
974
975