1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20/**
21 *******************************************************************************
22 * @file
23 *  ih264_resi_trans_quant.c
24 *
25 * @brief
26 *  Contains function definitions single stage  forward transform for H.264
27 *  It will calculate the residue, do the cf and then do quantization
28 *
29 * @author
30 *  Ittiam
31 *
32 * @par List of Functions:
33 *  - ih264_resi_trans_quant_4x4()
34 *  - ih264_resi_trans_quant_chroma_4x4
35 *  - ih264_hadamard_quant_4x4
36 *  - ih264_hadamard_quant_2x2_uv
37 *  - ih264_resi_trans_quant_8x8
38 *
39 * @remarks
40 *******************************************************************************
41 */
42
43/*****************************************************************************/
44/* File Includes                                                             */
45/*****************************************************************************/
46
47/* System include files */
48#include <stddef.h>
49
50/* User include files */
51#include "ih264_typedefs.h"
52#include "ih264_defs.h"
53#include "ih264_size_defs.h"
54#include "ih264_macros.h"
55#include "ih264_trans_macros.h"
56#include "ih264_trans_data.h"
57#include "ih264_structs.h"
58#include "ih264_trans_quant_itrans_iquant.h"
59
60/**
61 *******************************************************************************
62 *
63 * @brief
64 *   This function performs forward transform and quantization on a 4*4 block
65 *
66 * @par Description:
 *   The function accepts a source buffer and a prediction buffer. From these,
 *   it computes the residue. This residue is then transformed and quantized.
 *   The transform and quantization are computed in place. They use the residue
 *   buffer for this.
71 *
72 * @param[in] pu1_src
73 *   Pointer to source sub-block
74 *
75 * @param[in] pu1_pred
76 *   Pointer to prediction sub-block
77 *
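 * @param[out] pi2_out
 *   Pointer to residual sub-block; receives the 16 transformed and quantized
 *   coefficients (4 per row, stored contiguously)
 *
 * @param[in] src_strd
 *   Source stride
 *
 * @param[in] pred_strd
 *   Prediction stride
 *
 * @param[out] pi2_alt_dc_addr
 *   Pointer that receives the DC coefficient before it is quantized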
89 *
90 * @param[in] u4_qbits
91 *    QP_BITS_h264_4x4 + floor(QP/6)
92 *
93 * @param[in] pu2_threshold_matrix
94 *   Pointer to Forward Quant Threshold Matrix
95 *
96 * @param[in] pu2_scale_matrix
97 *   Pointer to Forward Quant Scale Matrix
98 *
99 * @param[in] u4_round_factor
100 *   Quantization Round factor
101 *
102 * @param[out] pu1_nnz
103 *   Total non-zero coefficients in the current sub-block
104 *
105 * @returns
106 *
107 * @remarks
108 *   None
109 *
110 *******************************************************************************
111 */
112void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
113                                UWORD8 *pu1_pred,
114                                WORD16 *pi2_out,
115                                WORD32 src_strd,
116                                WORD32 pred_strd,
117                                const UWORD16 *pu2_scale_matrix,
118                                const UWORD16 *pu2_threshold_matrix,
119                                UWORD32 u4_qbits,
120                                UWORD32 u4_round_factor,
121                                UWORD8 *pu1_nnz,
122                                WORD16 *pi2_alt_dc_addr)
123{
124    UWORD32 i;
125    WORD32  x0, x1, x2, x3, x4, x5, x6, x7;
126    WORD32  i4_value, i4_sign;
127    UWORD32 u4_abs_value;
128    WORD16  *pi2_out_tmp = pi2_out;
129    UWORD32 u4_nonzero_coeff = 0;
130
131    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
132    {
133        /* computing prediction error (residue) */
134        x4 = pu1_src[0] - pu1_pred[0];
135        x5 = pu1_src[1] - pu1_pred[1];
136        x6 = pu1_src[2] - pu1_pred[2];
137        x7 = pu1_src[3] - pu1_pred[3];
138
139        /* Horizontal transform */
140        x0 = x4 + x7;
141        x1 = x5 + x6;
142        x2 = x5 - x6;
143        x3 = x4 - x7;
144
145        pi2_out_tmp[0] = x0 + x1;
146        pi2_out_tmp[1] = (x3 <<1) + x2;
147        pi2_out_tmp[2] = x0 - x1;
148        pi2_out_tmp[3] = x3 - (x2<<1);
149
150        /* pointing to next row; */
151        pu1_src += src_strd;
152        pu1_pred += pred_strd;
153        pi2_out_tmp += 4;
154
155    }
156    pi2_out_tmp = pi2_out;
157    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
158    {
159
160        /* Vertical transform and quantization */
161        x4 = pi2_out_tmp[0];
162        x5 = pi2_out_tmp[4];
163        x6 = pi2_out_tmp[8];
164        x7 = pi2_out_tmp[12];
165
166
167        x0 = x4 + x7;
168        x1 = x5 + x6;
169        x2 = x5 - x6;
170        x3 = x4 - x7;
171
172        /* quantization is done in place */
173
174        i4_value = x0 + x1;
175
176        if(i==0)
177        {
178          (*pi2_alt_dc_addr) = i4_value;
179        }
180
181        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
182        pi2_out_tmp[0] = i4_value;
183
184
185        i4_value = (x3 << 1) + x2;
186        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
187        pi2_out_tmp[4] = i4_value;
188
189
190        i4_value = x0 - x1;
191        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
192        pi2_out_tmp[8] = i4_value;
193
194
195        i4_value = x3 - (x2 << 1);
196        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
197        pi2_out_tmp[12] = i4_value;
198
199        pi2_out_tmp ++;
200        pu2_scale_matrix++;
201        pu2_threshold_matrix++;
202    }
203
204    /* Return total nonzero coefficients in the current sub block */
205    *pu1_nnz =  u4_nonzero_coeff;
206}
207/**
208 *******************************************************************************
209 *
210 * @brief
211 *   This function performs forward transform and quantization on a 4*4 chroma block
212 *   with interleaved values
213 *
214 * @par Description:
 *   The function accepts a source buffer and a prediction buffer. From these,
 *   it computes the residue, reading every second byte of each row (offsets
 *   0, 2, 4 and 6). This residue is then transformed and quantized.
 *   The transform and quantization are computed in place. They use the residue
 *   buffer for this.
219 *
220 * @param[in] pu1_src
221 *   Pointer to source sub-block
222 *
223 * @param[in] pu1_pred
224 *   Pointer to prediction sub-block
225 *
 * @param[out] pi2_out
 *   Pointer to residual sub-block; receives the 16 transformed and quantized
 *   coefficients (4 per row, stored contiguously)
 *
 * @param[in] src_strd
 *   Source stride
 *
 * @param[in] pred_strd
 *   Prediction stride
 *
 * @param[out] pu1_dc_alt_addr
 *   Pointer that receives the DC coefficient before it is quantized
237 *
238 * @param[in] u4_qbits
239 *    QP_BITS_h264_4x4 + floor(QP/6)
240 *
241 * @param[in] pu2_threshold_matrix
242 *   Pointer to Forward Quant Threshold Matrix
243 *
244 * @param[in] pu2_scale_matrix
245 *   Pointer to Forward Quant Scale Matrix
246 *
247 * @param[in] u4_round_factor
248 *   Quantization Round factor
249 *
250 * @param[out] pu1_nnz
251 *   Total non-zero coefficients in the current sub-block
252 *
253 * @returns
254 *
255 * @remarks
256 *   None
257 *
258 *******************************************************************************
259 */
260void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
261                                       UWORD8 *pu1_pred,
262                                       WORD16 *pi2_out,
263                                       WORD32 src_strd,
264                                       WORD32 pred_strd,
265                                       const UWORD16 *pu2_scale_matrix,
266                                       const UWORD16 *pu2_threshold_matrix,
267                                       UWORD32 u4_qbits,
268                                       UWORD32 u4_round_factor,
269                                       UWORD8 *pu1_nnz,
270                                       WORD16 *pu1_dc_alt_addr)
271{
272    UWORD32 i;
273    WORD32  x0, x1, x2, x3, x4, x5, x6, x7;
274    WORD32  i4_value, i4_sign;
275    UWORD32 u4_abs_value;
276    WORD16  *pi2_out_tmp = pi2_out;
277    UWORD32 u4_nonzero_coeff = 0;
278
279    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
280    {
281        /* computing prediction error (residue) */
282        x4 = pu1_src[0] - pu1_pred[0];
283        x5 = pu1_src[2] - pu1_pred[2];
284        x6 = pu1_src[4] - pu1_pred[4];
285        x7 = pu1_src[6] - pu1_pred[6];
286
287        /* Horizontal transform */
288        x0 = x4 + x7;
289        x1 = x5 + x6;
290        x2 = x5 - x6;
291        x3 = x4 - x7;
292
293        pi2_out_tmp[0] = x0 + x1;
294        pi2_out_tmp[1] = (x3 <<1) + x2;
295        pi2_out_tmp[2] = x0 - x1;
296        pi2_out_tmp[3] = x3 - (x2<<1);
297
298        /* pointing to next row; */
299        pu1_src += src_strd;
300        pu1_pred += pred_strd;
301        pi2_out_tmp += 4;
302
303    }
304    pi2_out_tmp = pi2_out;
305    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
306    {
307
308        /* Vertical transform and quantization */
309        x4 = pi2_out_tmp[0];
310        x5 = pi2_out_tmp[4];
311        x6 = pi2_out_tmp[8];
312        x7 = pi2_out_tmp[12];
313
314
315        x0 = x4 + x7;
316        x1 = x5 + x6;
317        x2 = x5 - x6;
318        x3 = x4 - x7;
319
320        /* quantization is done in place */
321
322        i4_value = x0 + x1;
323
324        if(i==0)
325        {
326          *pu1_dc_alt_addr = i4_value;
327        }
328
329        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
330                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
331                  u4_nonzero_coeff);
332        pi2_out_tmp[0] = i4_value;
333
334        i4_value = (x3 << 1) + x2;
335        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
336                  pu2_scale_matrix[4], u4_round_factor, u4_qbits,
337                  u4_nonzero_coeff);
338        pi2_out_tmp[4] = i4_value;
339
340        i4_value = x0 - x1;
341        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
342                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
343                  u4_nonzero_coeff);
344        pi2_out_tmp[8] = i4_value;
345
346        i4_value = x3 - (x2 << 1);
347        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
348                  pu2_scale_matrix[12], u4_round_factor, u4_qbits,
349                  u4_nonzero_coeff);
350        pi2_out_tmp[12] = i4_value;
351
352        pi2_out_tmp ++;
353        pu2_scale_matrix++;
354        pu2_threshold_matrix++;
355    }
356
357    /* Return total nonzero coefficients in the current sub block */
358    *pu1_nnz =  u4_nonzero_coeff;
359}
360
361/**
362 *******************************************************************************
363 *
364 * @brief
365 *   This function performs forward hadamard transform and quantization on a 4*4 block
366 *
 * @par Description:
 *   The function accepts a 4x4 block of 16-bit input values, applies a 4x4
 *   Hadamard transform (rows first, then columns, with each column result
 *   shifted right by one) and quantizes the result. Every coefficient is
 *   quantized with entry 0 of the threshold and scale matrices.
 *
 * @param[in] pi2_src
 *   Pointer to the 4x4 input block (16 contiguous values)
 *
 * @param[out] pi2_dst
 *   Pointer to the 4x4 output block (16 contiguous values); it also holds
 *   the intermediate row-transform results
390 *
391 * @param[in] u4_qbits
392 *    QP_BITS_h264_4x4 + floor(QP/6)
393 *
394 * @param[in] pu2_threshold_matrix
395 *   Pointer to Forward Quant Threshold Matrix
396 *
397 * @param[in] pu2_scale_matrix
398 *   Pointer to Forward Quant Scale Matrix
399 *
400 * @param[in] u4_round_factor
401 *   Quantization Round factor
402 *
403 * @param[out] pu1_nnz
404 *   Total non-zero coefficients in the current sub-block
405 *
406 * @returns
407 *
408 * @remarks
409 *   None
410 *
411 */
412
413void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
414                              WORD16 *pi2_dst,
415                              const UWORD16 *pu2_scale_matrix,
416                              const UWORD16 *pu2_threshold_matrix,
417                              UWORD32 u4_qbits,
418                              UWORD32 u4_round_factor,
419                              UWORD8 *pu1_nnz)
420{
421  WORD32 i;
422  WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
423  UWORD32 u4_abs_value;
424  WORD32 i4_sign;
425
426  *pu1_nnz = 0;
427
428  for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
429    {
430        x4 = pi2_src[0];
431        x5 = pi2_src[1];
432        x6 = pi2_src[2];
433        x7 = pi2_src[3];
434
435        x0 = x4 + x7;
436        x1 = x5 + x6;
437        x2 = x5 - x6;
438        x3 = x4 - x7;
439
440        pi2_dst[0] = x0 + x1;
441        pi2_dst[1] = x3 + x2;
442        pi2_dst[2] = x0 - x1;
443        pi2_dst[3] = x3 - x2;
444
445        pi2_src += 4;
446        pi2_dst += 4;
447    }
448
449    /* Vertical transform and quantization */
450    pi2_dst -= SUB_BLK_WIDTH_4x4<<2;
451
452    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
453    {
454        x4 = pi2_dst[0];
455        x5 = pi2_dst[4];
456        x6 = pi2_dst[8];
457        x7 = pi2_dst[12] ;
458
459        x0 = x4 + x7;
460        x1 = x5 + x6;
461        x2 = x5 - x6;
462        x3 = x4 - x7;
463
464
465        i4_value = (x0 + x1) >> 1;
466        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
467                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
468        pi2_dst[0] = i4_value;
469
470        i4_value = (x3 + x2) >> 1;
471        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
472                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
473        pi2_dst[4] = i4_value;
474
475        i4_value = (x0 - x1) >> 1;
476        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
477                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
478        pi2_dst[8] = i4_value;
479
480        i4_value = (x3 - x2) >> 1;
481        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
482                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
483        pi2_dst[12] = i4_value;
484
485        pi2_dst ++;
486    }
487}
488
489/**
490 *******************************************************************************
491 *
492 * @brief
493 *   This function performs forward hadamard transform and quantization on a 2*2 block
494 *   for both U and V planes
495 *
 * @par Description:
 *   The function accepts two consecutive 2x2 blocks of 16-bit input values
 *   (one per plane), applies a 2x2 Hadamard transform to each block and
 *   quantizes the result. Every coefficient is quantized with entry 0 of the
 *   threshold and scale matrices.
 *
 * @param[in] pi2_src
 *   Pointer to the input values: 4 values for the first plane followed by
 *   4 values for the second plane
 *
 * @param[out] pi2_dst
 *   Pointer to the output values, laid out like the input
519 *
520 * @param[in] u4_qbits
521 *    QP_BITS_h264_4x4 + floor(QP/6)
522 *
523 * @param[in] pu2_threshold_matrix
524 *   Pointer to Forward Quant Threshold Matrix
525 *
526 * @param[in] pu2_scale_matrix
527 *   Pointer to Forward Quant Scale Matrix
528 *
529 * @param[in] u4_round_factor
530 *   Quantization Round factor
531 *
 * @param[out] pu1_nnz
 *   Number of non-zero quantized coefficients, one count per plane
 *
 * @returns
 *
 * @remarks
 *   NNZ for dc is populated at pu1_nnz[0] for the first plane and at
 *   pu1_nnz[1] for the second plane
539 *
540 */
541
542void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
543                                 WORD16 *pi2_dst,
544                                 const UWORD16 *pu2_scale_matrix,
545                                 const UWORD16 *pu2_threshold_matrix,
546                                 UWORD32 u4_qbits,
547                                 UWORD32 u4_round_factor,
548                                 UWORD8 *pu1_nnz)
549{
550    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
551    WORD32 i4_value, i4_sign, plane;
552    UWORD32 u4_abs_value;
553
554    for(plane = 0; plane < 2; plane++)
555    {
556        pu1_nnz[plane] = 0;
557
558        /* Horizontal transform */
559        x4 = pi2_src[0];
560        x5 = pi2_src[1];
561        x6 = pi2_src[2];
562        x7 = pi2_src[3];
563
564        x0 = x4 + x5;
565        x1 = x4 - x5;
566        x2 = x6 + x7;
567        x3 = x6 - x7;
568
569        /* Vertical transform and quantization */
570        i4_value = (x0 + x2);
571        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
572                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
573                  pu1_nnz[plane]);
574        pi2_dst[0] = i4_value;
575
576        i4_value = (x0 - x2);
577        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
578                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
579                  pu1_nnz[plane]);
580        pi2_dst[2] = i4_value;
581
582        i4_value = (x1 - x3);
583        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
584                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
585                  pu1_nnz[plane]);
586        pi2_dst[3] = i4_value;
587
588        i4_value = (x1 + x3);
589        FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
590                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
591                  pu1_nnz[plane]);
592        pi2_dst[1] = i4_value;
593
594        pi2_dst += 4;
595        pi2_src += 4;
596
597    }
598}
599
600/*
601 *******************************************************************************
602 *
603 * @brief
604 *  This function performs Single stage forward transform CF8 and quantization on 8*8 blocks
605 *  for h.264
606 *
607 * @par Description:
608 *  Performs single stage 8x8 forward transform CF8 after calculating the residue
609 *  The result is then quantized
610 *
 * @param[in] pu1_src
 *  Pointer to the 8x8 source pixels
 *
 * @param[in] pu1_pred
 *  Pointer to the 8x8 prediction pixels
 *
 * @param[out] pi2_out
 *  Pointer to the 8x8 output block (64 contiguous coefficients, 8 per row)
 *
 * @param[in] src_strd
 *  Stride of the source buffer
 *
 * @param[in] pred_strd
 *  Stride of the prediction buffer
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to the forward quantization scale matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to the forward quantization threshold matrix
 *
 * @param[in] u4_qbits
 *  Number of quantization bits passed to FWD_QUANT
 *
 * @param[in] u4_round_factor
 *  Quantization round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current block
 *
 * @param[in] pu1_dc_alt_addr
 *  Unused
640 *
641 * @returns  Void
642 *
643 *
644 *******************************************************************************
645 */
646void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
647                                UWORD8 *pu1_pred,
648                                WORD16 *pi2_out,
649                                WORD32 src_strd,
650                                WORD32 pred_strd,
651                                const UWORD16 *pu2_scale_matrix,
652                                const UWORD16 *pu2_threshold_matrix,
653                                UWORD32 u4_qbits,
654                                UWORD32 u4_round_factor,
655                                UWORD8 *pu1_nnz,
656                                WORD16 *pu1_dc_alt_addr)
657
658{
659    WORD16 *pi2_out_tmp = pi2_out;
660    UWORD32 i;
661    WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
662    WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
663    WORD32 i4_sign;
664    UWORD32 u4_abs_value;
665    UWORD32 u4_nonzero_coeff = 0;
666
667    UNUSED(pu1_dc_alt_addr);
668
669    /*Horizontal transform */
670    /* we are going to use the a's and r's in a twisted way since */
671    /*i dont want to declare more variables */
672    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
673    {
674        r0 = pu1_src[0];
675        r0 -= pu1_pred[0];
676        r1 = pu1_src[1];
677        r1 -= pu1_pred[1];
678        r2 = pu1_src[2];r2 -= pu1_pred[2];
679        r3 = pu1_src[3];r3 -= pu1_pred[3];
680        r4 = pu1_src[4];r4 -= pu1_pred[4];
681        r5 = pu1_src[5];r5 -= pu1_pred[5];
682        r6 = pu1_src[6];r6 -= pu1_pred[6];
683        r7 = pu1_src[7];r7 -= pu1_pred[7];
684
685
686        a0 = r0 + r7;
687        a1 = r1 + r6;
688        a2 = r2 + r5;
689        a3 = r3 + r4;
690
691        a4 = a0 + a3;
692        a5 = a1 + a2;
693        a6 = a0 - a3;
694        a7 = a1 - a2;
695
696        pi2_out_tmp[0] = a4 + a5;
697
698        pi2_out_tmp[2] = a6 + (a7>>1);
699        pi2_out_tmp[4] = a4 - a5;
700        pi2_out_tmp[6] = (a6>>1) - a7;
701
702        a0 = r0 - r7;
703        a1 = r1 - r6;
704        a2 = r2 - r5;
705        a3 = r3 - r4;
706
707        a4 = a1 + a2 + ((a0>>1) + a0);
708        a5 = a0 - a3 - ((a2>>1) + a2);
709        a6 = a0 + a3 - ((a1>>1) + a1);
710        a7 = a1 - a2 + ((a3>>1) + a3);
711
712        pi2_out_tmp[1] = a4 + (a7>>2);
713        pi2_out_tmp[3] = a5 + (a6>>2);
714        pi2_out_tmp[5] = a6 - (a5>>2);
715        pi2_out_tmp[7] = (a4>>2) - a7;
716
717        pu1_src += src_strd;
718        pu1_pred += pred_strd;
719        pi2_out_tmp += 8;
720    }
721
722    /*vertical transform and quant */
723
724    pi2_out_tmp = pi2_out;
725
726    for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
727    {
728
729        r0 = pi2_out_tmp[0];
730        r1 = pi2_out_tmp[8];
731        r2 = pi2_out_tmp[16];
732        r3 = pi2_out_tmp[24];
733        r4 = pi2_out_tmp[32];
734        r5 = pi2_out_tmp[40];
735        r6 = pi2_out_tmp[48];
736        r7 = pi2_out_tmp[56];
737
738        a0 = r0 + r7;
739        a1 = r1 + r6;
740        a2 = r2 + r5;
741        a3 = r3 + r4;
742
743        a4 = a0 + a3;
744        a5 = a1 + a2;
745        a6 = a0 - a3;
746        a7 = a1 - a2;
747
748        a0 = r0 - r7;
749        a1 = r1 - r6;
750        a2 = r2 - r5;
751        a3 = r3 - r4;
752
753        r0 = a4 + a5;
754        r2 = a6 + (a7>>1);
755        r4 = a4 - a5;
756        r6 = (a6>>1) - a7;
757
758        a4 = a1 + a2 + ((a0>>1) + a0);
759        a5 = a0 - a3 - ((a2>>1) + a2);
760        a6 = a0 + a3 - ((a1>>1) + a1);
761        a7 = a1 - a2 + ((a3>>1) + a3);
762
763        r1 = a4 + (a7>>2);
764        r3 = a5 + (a6>>2);
765        r5 = a6 - (a5>>2);
766        r7 = (a4>>2) - a7;
767
768        FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
769                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
770                  u4_nonzero_coeff);
771        pi2_out_tmp[0] = r0;
772
773        FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
774                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
775                  u4_nonzero_coeff);
776        pi2_out_tmp[8] = r1;
777
778        FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
779                  pu2_scale_matrix[16], u4_round_factor, u4_qbits,
780                  u4_nonzero_coeff);
781        pi2_out_tmp[16] = r2;
782
783        FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
784                  pu2_scale_matrix[24], u4_round_factor, u4_qbits,
785                  u4_nonzero_coeff);
786        pi2_out_tmp[24] = r3;
787
788        FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
789                  pu2_scale_matrix[32], u4_round_factor, u4_qbits,
790                  u4_nonzero_coeff);
791        pi2_out_tmp[32] = r4;
792
793        FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
794                  pu2_scale_matrix[40], u4_round_factor, u4_qbits,
795                  u4_nonzero_coeff);
796        pi2_out_tmp[40] = r5;
797
798        FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
799                  pu2_scale_matrix[48], u4_round_factor, u4_qbits,
800                  u4_nonzero_coeff);
801        pi2_out_tmp[48] = r6;
802
803        FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
804                  pu2_scale_matrix[56], u4_round_factor, u4_qbits,
805                  u4_nonzero_coeff);
806        pi2_out_tmp[56] = r7;
807
808        pi2_out_tmp++;
809        pu2_scale_matrix++;
810        pu2_threshold_matrix++;
811    }
812       /* Return total nonzero coefficients in the current sub block */
813        *pu1_nnz =  u4_nonzero_coeff;
814}
815