18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*/
207497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @file
238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  ih264_iquant_itrans_recon_a9.s
248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief
268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Contains function definitions for single stage  inverse transform
278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @author
298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Mohit
308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Harinarayanaan
318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par List of Functions:
338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  - ih264_iquant_itrans_recon_4x4_a9()
348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  - ih264_iquant_itrans_recon_8x8_a9()
358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  - ih264_iquant_itrans_recon_chroma_4x4_a9()
368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks
388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  None
398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
417497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
427497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description:
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Performs inverse transform Ci4 and adds the residue to get the
508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  reconstructed block
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Input 4x4 coefficients
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu1_pred
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Prediction 4x4 block
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pu1_out
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Output 4x4 block
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  QP / 6, used as the left-shift amount during inverse quantization
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to weight matrix
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pred_strd,
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Prediction stride
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] out_strd
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Output Stride
728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi4_tmp
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Temporary buffer of size 1*16
758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to the inverse quantization matrix
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns  Void
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  None
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
857497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD8 *pu1_pred,
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD8 *pu1_out,
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 pred_strd,
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 out_strd,
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   const UWORD16 *pu2_iscal_mat,
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   const UWORD16 *pu2_weigh_mat,
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD32 u4_qp_div_6,
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 *pi4_tmp,
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 iq_start_idx,
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD16 *pi2_dc_ld_addr)
978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@**************Variables Vs Registers*****************************************
988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r0 => *pi2_src
998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r1 => *pu1_pred
1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r2 => *pu1_out
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r3 =>  pred_strd
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r4 =>  out_strd
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r5 =>  *pu2_iscal_mat
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r6 =>  *pu2_weigh_mat
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r7 =>  u4_qp_div_6
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r8 =>  iq_start_idx
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r10=>  pi2_dc_ld_addr
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.text
109b5cec4f95fef51237ac5239983f636efacd2d63fMartin Storsjo.syntax unified
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S.p2align 2
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_iquant_itrans_recon_4x4_a9
1138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_iquant_itrans_recon_4x4_a9:
1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
@ Overview of this routine:
@   1. Dequantise the 16 coefficients at r0: multiply by the product of the
@      pu2_iscal_mat and pu2_weigh_mat entries, shift left by u4_qp_div_6,
@      then do a rounding, saturating right shift by 4 back to 16 bits.
@   2. If iq_start_idx == 1, overwrite coefficient 0 with pi2_dc_ld_addr[0].
@   3. Run two butterfly passes with a 4x4 transpose in between.
@   4. Rounding right shift by 6, add the four 4-byte prediction rows,
@      saturate to unsigned 8 bit and store four 4-byte rows to pu1_out.
@ r0-r3 carry the first four arguments; the rest are read from the stack.
@ pi4_tmp ([sp, #56] after the register save) is never read by this routine.
@
1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@If the macro value changes, the instruction needs to be changed accordingly.
1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Only one shift is done in horizontal inverse because,
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@if u4_qp_div_6 is less than 4 then the shift value will be negative and a negative left shift is done; in this case rnd_factor has a value
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@if u4_qp_div_6 is greater than 4 then the shift value will be positive and a left shift is done; here rnd_factor is 0
@NOTE(review): the code below always shifts left by u4_qp_div_6 (vshl) and then
@does a rounding right shift by 4 (vqrshrn); the three lines above seem to
@describe a single combined shift - confirm against the C reference.
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r4-r12, r14}    @Save r4-r12 and lr (10 words), so the stack arguments start at [sp, #40]
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r4, [sp, #40]         @Loads out_strd
1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.s32      q15, r7               @Replicate u4_qp_div_6 into every 32-bit lane of Q15 (shift count for the vshl below)
1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r8, [sp, #60]         @Loads iq_start_idx
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r10, [sp, #64]        @Loads pi2_dc_ld_addr (alternate dc address)
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8-d15}              @Save d8-d15; they are overwritten below and restored before returning
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@=======================DEQUANT FROM HERE===================================
@VLD4.16 de-interleaves with a stride of four elements: the first register of
@each list receives memory elements 0,4,8,12, the second 1,5,9,13, the third
@2,6,10,14 and the fourth 3,7,11,15.
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15 (de-interleaved)
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.s16      {d26, d27, d28, d29}, [r6] @Load pu2_weigh_mat[i], i =0..15 (de-interleaved)
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q10, q10, q13         @x[i]=(scale[i] * dequant[i]) for the elements in d20-d21 (i mod 4 = 0, 1); 16-bit result
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.s16      {d16, d17, d18, d19}, [r0] @Load pi2_src[i], i =0..15 (de-interleaved)
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q11, q11, q14         @x[i]=(scale[i] * dequant[i]) for the elements in d22-d23 (i mod 4 = 2, 3); 16-bit result
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    subs          r8, r8, #1            @ if iq_start_idx == 1 => intra case, so result of subtraction is zero and Z flag is set
145b5cec4f95fef51237ac5239983f636efacd2d63fMartin Storsjo    ldrsheq       r9, [r10]             @ Loads signed halfword pi2_dc_ld_addr[0], only if iq_start_idx == 1
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (src[i] * x[i]) widened to 32 bit, i = 0,4,8,12
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (src[i] * x[i]) widened to 32 bit, i = 1,5,9,13
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (src[i] * x[i]) widened to 32 bit, i = 2,6,10,14
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (src[i] * x[i]) widened to 32 bit, i = 3,7,11,15
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) for the lanes of Q0
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) for the lanes of Q1
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) for the lanes of Q2
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) for the lanes of Q3
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d0, q0, #0x4          @ D0  = c[i] = ((q[i] + 8) >> 4), saturated to signed 16 bit
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d1, q1, #0x4          @ D1  = c[i] = ((q[i] + 8) >> 4), saturated to signed 16 bit
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d2, q2, #0x4          @ D2  = c[i] = ((q[i] + 8) >> 4), saturated to signed 16 bit
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d3, q3, #0x4          @ D3  = c[i] = ((q[i] + 8) >> 4), saturated to signed 16 bit
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmoveq.16     d0[0], r9             @ Replace c[0] with the separately loaded dc value in case of intra (Z flag still set by the subs above)
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= PROCESS IDCT FROM HERE =======
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 1:
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
@Butterfly across d0..d3; the prediction row loads are interleaved with it.
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d30[0], [r1], r3      @I row: load 4 bytes of pu1_pred, then r1 += pred_strd
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d4, d0, d2            @x0 = d0 + d2
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d5, d0, d2            @x1 = d0 - d2
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d8, d1, #1            @d8 = d1 >> 1
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d9, d3, #1            @d9 = d3 >> 1
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d6, d8, d3            @x2 = (d1 >> 1) - d3
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d7, d1, d9            @x3 = d1 + (d3 >> 1)
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d30[1], [r1], r3      @II row: load 4 bytes of pu1_pred, then r1 += pred_strd
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d6, d7                @Reverse positions of x2 and x3, so q2 = {x0, x1} and q3 = {x3, x2}
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q6, q2, q3            @d12 = x0 - x3 and d13 = x1 - x2 combined
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q5, q2, q3            @d10 = x0 + x3 and d11 = x1 + x2 combined
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d31[0], [r1], r3      @III row: load 4 bytes of pu1_pred, then r1 += pred_strd
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d12, d13              @d12 = x1 - x2, d13 = x0 - x3
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2:
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
@The four vtrn instructions transpose the 4x4 block of 16-bit values held in
@d10..d13; the same butterfly is then applied to the transposed data.
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d10, d11              @Transpose step: 16-bit pairs of d10/d11
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d12, d13              @Transpose step: 16-bit pairs of d12/d13
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d10, d12              @Transpose step: 32-bit halves of d10/d12
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d11, d13              @Transpose step: 32-bit halves of d11/d13
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d14, d10, d12         @x0 = d10 + d12
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d15, d10, d12         @x1 = d10 - d12
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d18, d11, #1          @d18 = d11 >> 1
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d19, d13, #1          @d19 = d13 >> 1
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d16, d18, d13         @x2 = (d11 >> 1) - d13
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d17, d11, d19         @x3 = d11 + (d13 >> 1)
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d31[1], [r1], r3      @IV row: load 4 bytes of pu1_pred, then r1 += pred_strd
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d16, d17              @Reverse positions of x2 and x3, so q7 = {x0, x1} and q8 = {x3, x2}
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q11, q7, q8           @d22 = x0 - x3 and d23 = x1 - x2 combined
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q10, q7, q8           @d20 = x0 + x3 and d21 = x1 + x2 combined
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d22, d23              @d22 = x1 - x2, d23 = x0 - x3
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q10, q10, #6          @Rounding shift: (x + 32) >> 6 for d20, d21
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q11, q11, #6          @Rounding shift: (x + 32) >> 6 for d22, d23
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q10, q10, d30         @Add prediction rows I and II (bytes widened to 16 bit)
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q11, q11, d31         @Add prediction rows III and IV (bytes widened to 16 bit)
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d0, q10               @Saturate rows I and II to unsigned 8 bit
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d1, q11               @Saturate rows III and IV to unsigned 8 bit
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d0[0], [r2], r4       @I row store the value, then r2 += out_strd
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d0[1], [r2], r4       @II row store the value, then r2 += out_strd
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d1[0], [r2], r4       @III row store the value, then r2 += out_strd
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d1[1], [r2]           @IV row store the value
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8-d15}              @Restore d8-d15 saved at entry
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r4-r12, r15}    @Restore r4-r12 and return by loading the saved lr into pc
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2297497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  This function performs inverse quant and Inverse transform type Ci4 for a 4*4 chroma block
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description:
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Performs inverse transform Ci4 and adds the residue to get the
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  reconstructed block
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Input 4x4 coefficients
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu1_pred
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Prediction 4x4 block
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pu1_out
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Output 4x4 block
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  QP / 6, used as the left-shift amount during inverse quantization
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to weight matrix
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pred_strd,
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Prediction stride
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] out_strd
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Output Stride
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi4_tmp
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Temporary buffer of size 1*16
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to the inverse quantization matrix
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns  Void
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks
2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  None
2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
2727497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD8 *pu1_pred,
2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD8 *pu1_out,
2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 pred_strd,
2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 out_strd,
2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   const UWORD16 *pu2_iscal_mat,
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   const UWORD16 *pu2_weigh_mat,
2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD32 u4_qp_div_6,
2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 *pi4_tmp,
2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD16 *pi2_dc_src)
2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@**************Variables Vs Registers*****************************************
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r0 => *pi2_src
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r1 => *pu1_pred
2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r2 => *pu1_out
2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r3 =>  pred_strd
2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r4 =>  out_strd
2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r5 =>  *pu2_iscal_mat
2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r6 =>  *pu2_weigh_mat
2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r7 =>  u4_qp_div_6
2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_iquant_itrans_recon_chroma_4x4_a9
2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_iquant_itrans_recon_chroma_4x4_a9:
2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@If the macro value changes need to change the instruction according to it.
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Only one shift is done in horizontal inverse because,
2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@if u4_qp_div_6 is less than 4 then the shift value will be negative and a negative left shift is done; in this case rnd_factor has a value
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r4, [sp, #40]         @Loads out_strd
3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r8, [sp, #60]         @loads *pi2_dc_src
3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8-d15}
3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@=======================DEQUANT FROM HERE===================================
3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15
3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.s16      {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15
3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q10, q10, q13         @x[i]=(scale[i] * dequant[i]) where i = 0..7
3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld4.s16      {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i =0..15
3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q11, q11, q14         @x[i]=(scale[i] * dequant[i]) where i = 8..15
3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15
3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d0, q0, #0x4          @ D0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d1, q1, #0x4          @ D1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d2, q2, #0x4          @ D2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d3, q3, #0x4          @ D3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15
3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldrsh         r9, [r8]              @ Loads signed halfword pi2_dc_src[0]
3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.16       d0[0], r9             @ Restore dc value since its chroma iq-it
3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= PROCESS IDCT FROM HERE =======
3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 1:
3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld2.8        {d28, d29}, [r1], r3  @I row Load pu1_pred buffer
3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d4, d0, d2            @x0 = q0 + q1;
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d5, d0, d2            @x1 = q0 - q1;
3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d8, d1, #1            @q0>>1
3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d9, d3, #1            @q1>>1
3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d6, d8, d3            @x2 = (q0 >> 1) -  q1;
3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d7, d1, d9            @x3 = q0+ (q1 >> 1);
3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld2.8        {d29, d30}, [r1], r3  @II row Load pu1_pred buffer
3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d6, d7                @Reverse positions of x2 and x3
3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q6, q2, q3            @x0-x3 and x1-x2 combined
3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d28, d29              @ D28 -- row I and II of pu1_pred_buffer
3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q5, q2, q3            @x0 + x3 and x1+x2 combined
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld2.8        {d29, d30}, [r1], r3  @III row Load pu1_pred buf
3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d12, d13
3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2:
3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d10, d11
3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.16       d12, d13
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d10, d12
3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d11, d13
3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d14, d10, d12         @x0 = q0 + q1;
3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d15, d10, d12         @x1 = q0 - q1;
3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d18, d11, #1          @q0>>1
3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      d19, d13, #1          @q1>>1
3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      d16, d18, d13         @x2 = (q0 >> 1) -  q1;
3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      d17, d11, d19         @x3 = q0+ (q1 >> 1);
3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld2.8        {d30, d31}, [r1], r3  @IV row Load pu1_pred buffer
3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d16, d17              @Reverse positions of x2 and x3
3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q11, q7, q8           @x0-x3 and x1-x2 combined
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.32       d29, d30              @ D29 -- row III and IV of pu1_pred_buf
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q10, q7, q8           @x0 + x3 and x1+x2 combined
3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d22, d23
3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q10, q10, #6          @
3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q11, q11, #6
3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q10, q10, d28
3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q11, q11, d29
3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u8       d0, [r2], r4          @Loading out buffer 16 coeffs
3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u8       d1, [r2], r4
3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u8       d2, [r2], r4
3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u8       d3, [r2], r4
3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sub           r2, r2, r4, lsl #2
3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d20, q10              @Getting quantized coeffs
4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d22, q11
4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u8      q10, d20              @Move the coffs into 16 bit
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovl.u8      q11, d22              @so that we can use vbit to copy
4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.u16      q14, #0x00ff          @Copy lsb from qantized(long)coeffs
4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit.u8       q0, q10, q14
4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vbit.u8       q1, q11, q14
4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.u8       d0, [r2], r4
4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.u8       d1, [r2], r4
4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.u8       d2, [r2], r4
4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.u8       d3, [r2]
4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8-d15}
4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP
4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4207497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@*
4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief
4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  This function performs inverse quant and inverse transform type Ci8 for an 8x8 block
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description:
4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Performs inverse transform Ci8 and adds the residue to get the
4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  reconstructed block
4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src
4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Input 8x8 coefficients
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu1_pred
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Prediction 8x8 block
4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pu1_out
4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Output 8x8 block
4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6
4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  QP / 6, used as the left-shift amount during dequantization
4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat
4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to weight matrix
4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pred_strd,
4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Prediction stride
4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] out_strd
4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Output Stride
4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi4_tmp
4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * temporary buffer of size 1*64
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat
4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to the inverse quantization matrix
4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns  Void
4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks
4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  None
4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
4637497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD8 *pu1_pred,
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD8 *pu1_out,
4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 pred_strd,
4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 out_strd,
4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   const UWORD16 *pu2_iscal_mat,
4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   const UWORD16 *pu2_weigh_mat,
4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   UWORD32 u4_qp_div_6,
4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 *pi4_tmp,
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                   WORD32 iq_start_idx)
4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@**************Variables Vs Registers*****************************************
4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r0 => *pi2_src
4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r1 => *pu1_pred
4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r2 => *pu1_out
4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r3 =>  pred_strd
4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r4 =>  out_strd
4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r5 =>  *pu2_iscal_mat
4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r6 =>  *pu2_weigh_mat
4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r7 =>  u4_qp_div_6
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_iquant_itrans_recon_8x8_a9
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_iquant_itrans_recon_8x8_a9:
4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r4, [sp, #40]         @Loads out_strd
4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8-d15}
4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sidct_8x8_begin:
4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= DEQUANT FROM HERE ===========
5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q13}, [r5]!          @ Q13 = dequant values row 0
5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q10}, [r6]!          @ Q10 = scaling factors row 0
5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q14}, [r5]!          @ Q14 = dequant values row 1
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q10, q10, q13         @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q11}, [r6]!          @ Q11 = scaling factors row 1
5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q8}, [r0]!           @ Q8  = Source row 0
5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q11, q11, q14         @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q9}, [r0]!           @ Q8  = Source row 1
5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q13}, [r6]!          @ Scaling factors row 2
5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q14}, [r6]!          @ Scaling factors row 3
5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q10}, [r5]!          @ Q10 = Dequant values row 2
5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q8}, [r0]!           @ Source Row 2
5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q11}, [r5]!          @ Q11 = Dequant values row 3
5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15
5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q9}, [r0]!           @ Source Row 3
5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q10, q10, q13         @ Dequant row2*scale matrix row 2
5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q11, q11, q14         @ Dequant row 3*scale matrix row 3
5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q4}, [r6]!           @ Scaling factors row 4
5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d0, q0, #0x6          @ D0  = c[i] = ((q[i] + 32) >> 6) where i = 0..3
5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d1, q1, #0x6          @ D1  = c[i] = ((q[i] + 32) >> 6) where i = 4..7
5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q5}, [r6]!           @ Scaling factors row 5
5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d2, q2, #0x6          @ D2  = c[i] = ((q[i] + 32) >> 6) where i = 8..11
5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d3, q3, #0x6          @ D3  = c[i] = ((q[i] + 32) >> 6) where i = 12..15
5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q13}, [r5]!          @ Q13 = Dequant values row 4
5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q2, d16, d20          @ p[i] = (x[i] * trns_coeff[i]) where i=16..19
5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q3, d17, d21          @ p[i] = (x[i] * trns_coeff[i]) where i=20..23
5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q12}, [r5]!          @ Q12 = Dequant values row 5
5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q6, d18, d22          @ p[i] = (x[i] * trns_coeff[i]) where i=24..27
5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q7, d19, d23          @ p[i] = (x[i] * trns_coeff[i]) where i=28..31
5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q14}, [r0]!          @ Source row 4
5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q10, q4, q13          @ Dequant row4*scale matrix row 4
5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q11, q5, q12          @ Dequant row5*scale matrix row 5
5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q9}, [r0]!           @ Source row 5
5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q2, q2, q15           @
5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q3, q3, q15           @
5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q13}, [r6]!          @ Scaling factors row 6
5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q6, q6, q15           @
5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q7, q7, q15           @
5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q4, d28, d20          @ i = 32..35
5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d4, q2, #0x6          @ D4  = c[i] = ((q[i] + 32) >> 6) where i = 16..19
5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d5, q3, #0x6          @ D5  = c[i] = ((q[i] + 32) >> 6) where i = 20..23
5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q5, d29, d21          @ i =36..39
5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q10}, [r5]!          @ Dequant values row 6
5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d6, q6, #0x6          @ D6  = c[i] = ((q[i] + 32) >> 6) where i = 24..27
5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d7, q7, #0x6          @ D7  = c[i] = ((q[i] + 32) >> 6) where i = 28..31
5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q14}, [r6]!          @ Scaling factors row 7
5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q6, d18, d22          @
5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q8}, [r0]!           @ Source row 6
5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q7, d19, d23          @
5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q11}, [r5]!          @ Dequant values row 7
5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q4, q4, q15           @
5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       {q9}, [r0]!           @ Source row 7
5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q5, q5, q15           @
5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q6, q6, q15           @
5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q7, q7, q15           @
5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q10, q10, q13         @ Dequant*scaling row 6
5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s16      q11, q11, q14         @ Dequant*scaling row 7
5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d8, q4, #0x6          @ D8  = c[i] = ((q[i] + 32) >> 6) where i = 32..35
5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d9, q5, #0x6          @ D9  = c[i] = ((q[i] + 32) >> 6) where i = 36..39
5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d10, q6, #0x6         @ D10  = c[i] = ((q[i] + 32) >> 6) where i = 40..43
5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d11, q7, #0x6         @ D11  = c[i] = ((q[i] + 32) >> 6) where i = 44..47
5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q6, d16, d20          @ i= 48..51
5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q7, d17, d21          @ i= 52..55
5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q8, d18, d22          @ i=56..59
5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.s16     q9, d19, d23          @ i=60..63
5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q6, q6, q15           @
5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q0, q1                @Transpose
5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q7, q7, q15           @
5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q8, q8, q15           @
5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q2, q3                @
5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q9, q9, q15           @
5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d12, q6, #0x6         @ D12  = c[i] = ((q[i] + 32) >> 6) where i = 48..51
5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q4, q5                @Transpose
5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d13, q7, #0x6         @ D13  = c[i] = ((q[i] + 32) >> 6) where i = 52..55
5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d14, q8, #0x6         @ D14  = c[i] = ((q[i] + 32) >> 6) where i = 56..59
5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q0, q2                @Transpose
5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqrshrn.s32   d15, q9, #0x6         @ D15  = c[i] = ((q[i] + 32) >> 6) where i = 60..63
5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= PROCESS IDCT FROM HERE =======
5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2:
5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@   TRANSPOSE 8x8 coeffs to actual order
5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q6, q7                @
5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q1, q3                @
5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q4, q6                @
5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q5, q7                @
6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7
6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          q1, q4                @
6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q10, q2, #0x1         @
6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          q3, q6                @
6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 1:
6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q8, q0, q4            @ Q8 = y0
6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q9, q0, q4            @ Q9 = y2
6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.s16      q2, q6, #0x1          @ Q2 = y6
6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q6, q10, q6           @ Q6 = y4
6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q12, d14, d2          @ y3 (0-3) 1+7
6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q13, d15, d3          @ y3 (4-7) 1+7
6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q10, d14, d2          @ y5 (0-3) 7-1
6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q11, d15, d3          @ y5 (4-7) 7-1
6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q0, q8, q2            @ Q0 = z0
6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q4, q8, q2            @ Q4 = z6
6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q8, q9, q6            @ Q8 = z2
6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q2, q9, q6            @ Q2 = z4
6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q12, q12, d6          @ y3 (0-3) 1+7-3
6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q13, q13, d7          @ y3 (0-7) 1+7-3
6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q6, q3, #0x1          @
6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q10, q10, d10         @
6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q11, q11, d11         @
6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q9, q5, #0x1          @
6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q12, q12, d12         @
6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q13, q13, d13         @
6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q10, q10, d18         @
6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q11, q11, d19         @
6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d12, q12              @
6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q12, d10, d6          @
6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d13, q13              @ Q6 = y3
6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q13, d11, d7          @
6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d18, q10              @
6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q10, d10, d6          @
6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d19, q11              @ Q9 = y5
6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q11, d11, d7          @
6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q3, q6, #0x2          @
6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.s16      q6, q9, #0x2          @ Q6 = z3
6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q12, q12, d2          @
6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q13, q13, d3          @
6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q1, #0x1              @
6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q5, q3, q9            @ Q5 = z5
6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q10, q10, d14         @
6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q11, q11, d15         @
6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q7, #0x1              @
6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q12, q12, d2          @
6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q13, q13, d3          @
6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q10, q10, d14         @
6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q11, q11, d15         @
6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d14, q12              @
6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q1, q8, q5            @ Q1 = x1
6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d15, q13              @ Q7 = y7
6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q3, q8, q5            @ Q3 = x6
6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d18, q10              @
6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q5, q2, q6            @ Q5 = x5
6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d19, q11              @ Q9 = y1
6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q2, q2, q6            @ Q2 = x2
6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q12, q9, #0x2         @
6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.s16      q9, q7, #0x2          @ Q9 = z1
6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q11, q7, q12          @ Q11 = z7
6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q6, q4, q9            @ Q6 = x3
6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q4, q4, q9            @ Q4 = x4
6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q7, q0, q11           @ Q7 = x7
6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q0, q0, q11           @ Q0 = x0
6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp.s16      q3, q6                @ Q3 = x3, Q6 = x6
7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2:
7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@   TRANSPOSE 8x8 coeffs to actual order
7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q0, q1                @
7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q2, q3                @
7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q4, q5                @
7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s16      q6, q7                @
7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q0, q2                @
7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q1, q3                @
7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q4, q6                @
7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vzip.s32      q5, q7                @
7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7
7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          q1, q4                @
7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q10, q2, #0x1         @
7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp          q3, q6                @
7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 3:
7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------
7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Repeat stage 1 again for vertical transform
7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q8, q0, q4            @ Q8 = y0
7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d28, [r1], r3         @ D28 = 8 bytes from [r1], r1 += r3 (widened and added to Q0 after the transform)
7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q9, q0, q4            @ Q9 = y2
7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.s16      q2, q6, #0x1          @ Q2 = y6
7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q6, q10, q6           @ Q6 = y4
7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q12, d14, d2          @
7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d29, [r1], r3         @ D29 = 8 bytes from [r1], r1 += r3 (widened and added to Q1 after the transform)
7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q13, d15, d3          @
7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q10, d14, d2          @
7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d30, [r1], r3         @ D30 = 8 bytes from [r1], r1 += r3 (widened and added to Q2 after the transform)
7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q11, d15, d3          @
7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q0, q8, q2            @ Q0 = z0
7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d31, [r1], r3         @ D31 = 8 bytes from [r1], r1 += r3 (widened and added to Q3 after the transform)
7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q4, q8, q2            @ Q4 = z6
7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q8, q9, q6            @ Q8 = z2
7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q2, q9, q6            @ Q2 = z4
7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q12, q12, d6          @
7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q13, q13, d7          @
7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q6, q3, #0x1          @
7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q10, q10, d10         @
7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q11, q11, d11         @
7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q9, q5, #0x1          @
7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q12, q12, d12         @
7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q13, q13, d13         @
7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q10, q10, d18         @
7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q11, q11, d19         @
7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d12, q12              @
7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q12, d10, d6          @
7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d13, q13              @ Q6 = y3
7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q13, d11, d7          @
7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d18, q10              @
7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q10, d10, d6          @
7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d19, q11              @ Q9 = y5
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q11, d11, d7          @
7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q3, q6, #0x2          @
7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.s16      q6, q9, #0x2          @ Q6 = z3
7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q12, q12, d2          @
7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q13, q13, d3          @
7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q1, #0x1              @
7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q5, q3, q9            @ Q5 = z5
7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q10, q10, d14         @
7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q11, q11, d15         @
7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q7, #0x1              @
7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q12, q12, d2          @
7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.s16     q13, q13, d3          @
7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q10, q10, d14         @
7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubw.s16     q11, q11, d15         @
7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d14, q12              @
8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q1, q8, q5            @ Q1 = x1
8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d15, q13              @ Q7 = y7
8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q3, q8, q5            @ Q3 = x6
8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d18, q10              @
8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q5, q2, q6            @ Q5 = x5
8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovn.s32    d19, q11              @ Q9 = y1
8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q2, q2, q6            @ Q2 = x2
8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshr.s16      q12, q9, #0x2         @
8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsra.s16      q9, q7, #0x2          @ Q9 = z1
8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q11, q7, q12          @ Q11 = z7
8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q6, q4, q9            @ Q6 = x3
8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q4, q4, q9            @ Q4 = x4
8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s16      q7, q0, q11           @ Q7 = x7
8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s16      q0, q0, q11           @ Q0 = x0
8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vswp.s16      q3, q6                @ Q3 <-> Q6
8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q1, q1, #6            @
8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d16, [r1], r3         @ D16 = 8 bytes from [r1], r1 += r3 (widened and added to Q4 below)
8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q2, q2, #6            @
8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q4, q4, #6            @
8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d17, [r1], r3         @ D17 = 8 bytes from [r1], r1 += r3 (widened and added to Q5 below)
8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q5, q5, #6            @
8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q7, q7, #6            @
8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d18, [r1], r3         @ D18 = 8 bytes from [r1], r1 += r3 (widened and added to Q6 below)
8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q0, q0, #6            @
8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q3, q3, #6            @
8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.32       d19, [r1], r3         @ D19 = 8 bytes from [r1], r1 += r3 (widened and added to Q7 below)
8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vrshr.s16     q6, q6, #6            @
8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ Add the 8-bit rows loaded from [r1] to the rounded 16-bit results (vaddw.u8), saturate to unsigned 8 bits (vqmovun.s16) and store to [r2]
8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q0, q0, d28
8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q1, q1, d29
8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q2, q2, d30
8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q3, q3, d31
8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d0, q0
8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q4, q4, d16
8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d1, q1
8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q5, q5, d17
8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d2, q2
8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q6, q6, d18
8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d3, q3
8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddw.u8      q7, q7, d19
8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d4, q4
8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d0, [r2], r4          @ Store D0 (Q0 saturated to 8 x u8) to [r2], r2 += r4
8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d5, q5
8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d1, [r2], r4          @ Store D1 (Q1 saturated to 8 x u8) to [r2], r2 += r4
8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d6, q6
8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d2, [r2], r4          @ Store D2 (Q2 saturated to 8 x u8) to [r2], r2 += r4
8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vqmovun.s16   d7, q7
8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d3, [r2], r4          @ Store D3 (Q3 saturated to 8 x u8) to [r2], r2 += r4
8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d4, [r2], r4          @ Store D4 (Q4 saturated to 8 x u8) to [r2], r2 += r4
8598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d5, [r2], r4          @ Store D5 (Q5 saturated to 8 x u8) to [r2], r2 += r4
8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d6, [r2], r4          @ Store D6 (Q6 saturated to 8 x u8) to [r2], r2 += r4
8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst1.32       d7, [r2], r4          @ Store D7 (Q7 saturated to 8 x u8) to [r2], r2 += r4
8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sidct_8x8_end:
8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8-d15}
8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ldmfd         sp!, {r4-r12, r15}
8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
873