18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*/
207497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@**
218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @file
238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  ih264_ihadamard_scaling_a9.s
248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief
268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  of 16x16 intra-prediction
288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @author
308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Mohit
318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par List of Functions:
338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  - ih264_ihadamard_scaling_4x4_a9()
348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  - ih264_ihadamard_scaling_2x2_uv_a9()
358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks
378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  None
388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
407497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
@ * of a 16x16 intra prediction macroblock, and then performs scaling.
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description:
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  The inverse-transformed content is then scaled based on the Qp value.
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src
508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  input 4x4 block of DC coefficients
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pi2_out
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  output 4x4 block
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  pointer to scaling list
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  pointer to weight matrix
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Floor (qp/6)
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi4_tmp
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * temporary buffer of size 1*16
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns none
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks none
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
727497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
757497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@       WORD16* pi2_out,
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@       const UWORD16 *pu2_iscal_mat,
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@       const UWORD16 *pu2_weigh_mat,
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@       UWORD32 u4_qp_div_6,
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@       WORD32* pi4_tmp)
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@**************Variables Vs Registers*****************************************
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r0 => *pi2_src
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r1 => *pi2_out
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r2 =>  *pu2_iscal_mat
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r3 =>  *pu2_weigh_mat
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r4 =>  u4_qp_div_6
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_a9

ih264_ihadamard_scaling_4x4_a9:

@ 4x4 inverse Hadamard transform followed by scaling.
@
@ Inputs as used by the code below:
@   r0         : pi2_src        - 16 signed halfwords are read
@   r1         : pi2_out        - 16 signed halfwords are written
@   r2         : pu2_iscal_mat  - only element [0] is read
@   r3         : pu2_weigh_mat  - only element [0] is read
@   [sp] entry : u4_qp_div_6    - read at [sp, #40] once 10 core registers are pushed
@ The sixth argument (pi4_tmp) is never accessed by this routine; all
@ intermediates stay in NEON registers.
@
@ Per element the routine computes
@   out[i] = sat16( (((t[i] * iscal[0] * weigh[0]) << u4_qp_div_6) + 32) >> 6 )
@ where t[] is the result of the two butterfly stages.
@
@ vld4.s16 is used because it de-interleaves with a stride of four halfwords,
@ which matches a 4-element-wide block (SUB_BLK_WIDTH_4x4 in the C reference,
@ presumably 4 - if that width ever changes, the load must change with it).

    stmfd         sp!, {r4-r12, r14}    @ save r4-r12 and lr (10 words = 40 bytes)
    ldr           r4, [sp, #40]         @ r4 = u4_qp_div_6 (first stack argument, above the 40 saved bytes)
    vdup.s32      q10, r4               @ q10 = u4_qp_div_6 in all four 32-bit lanes (left-shift count)
    ldrh          r6, [r3]              @ r6 = pu2_weigh_mat[0] (zero-extended halfword)
    ldrh          r7, [r2]              @ r7 = pu2_iscal_mat[0] (zero-extended halfword)
    mul           r6, r6, r7            @ r6 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    vdup.s32      q9, r6                @ q9 = that product in all four 32-bit lanes
    vpush         {d8-d15}              @ save d8-d15; q4/q5 (d8-d11) are overwritten below
@=======================INVERSE HADAMARD TRANSFORM================================

    @ First stage: butterflies within each group of four consecutive inputs.
    @ After the load, lane k of dN holds src[4*k + N].
    vld4.s16      {d0, d1, d2, d3}, [r0] @ d0..d3 = x4, x5, x6, x7 (de-interleaved by 4)
    vaddl.s16     q12, d0, d3           @ x0 = x4 + x7   (widened to 32 bits)
    vaddl.s16     q13, d1, d2           @ x1 = x5 + x6
    vsubl.s16     q14, d1, d2           @ x2 = x5 - x6
    vsubl.s16     q15, d0, d3           @ x3 = x4 - x7

    vadd.s32      q2, q12, q13          @ tmp[0] = x0 + x1
    vadd.s32      q3, q15, q14          @ tmp[1] = x3 + x2
    vsub.s32      q4, q12, q13          @ tmp[2] = x0 - x1
    vsub.s32      q5, q15, q14          @ tmp[3] = x3 - x2

    @ 4x4 transpose of the 32-bit intermediates (two vtrn.32 + two vswp),
    @ so that the second stage can again work lane-wise.
    vtrn.32       q2, q3
    vtrn.32       q4, q5

    vswp          d5, d8                @ q2 = x4, q4 = x6
    vswp          d7, d10               @ q3 = x5, q5 = x7


    @ Second stage: same butterflies across the other dimension.
    vadd.s32      q12, q2, q5           @ x0 = x4 + x7
    vadd.s32      q13, q3, q4           @ x1 = x5 + x6
    vsub.s32      q14, q3, q4           @ x2 = x5 - x6
    vsub.s32      q15, q2, q5           @ x3 = x4 - x7

    vadd.s32      q0, q12, q13          @ t[0..3]   = x0 + x1
    vadd.s32      q1, q15, q14          @ t[4..7]   = x3 + x2
    vsub.s32      q2, q12, q13          @ t[8..11]  = x0 - x1
    vsub.s32      q3, q15, q14          @ t[12..15] = x3 - x2


    @ Scaling: multiply every element by iscal[0] * weigh[0].
    vmul.s32      q0, q0, q9            @ p[i] = t[i] * (iscal[0] * weigh[0]), i = 0..3
    vmul.s32      q1, q1, q9            @ p[i], i = 4..7
    vmul.s32      q2, q2, q9            @ p[i], i = 8..11
    vmul.s32      q3, q3, q9            @ p[i], i = 12..15

    @ Shift left by u4_qp_div_6 (per-lane count held in q10).
    vshl.s32      q0, q0, q10           @ q[i] = p[i] << u4_qp_div_6, i = 0..3
    vshl.s32      q1, q1, q10           @ q[i], i = 4..7
    vshl.s32      q2, q2, q10           @ q[i], i = 8..11
    vshl.s32      q3, q3, q10           @ q[i], i = 12..15

    @ Rounding right shift by 6 with saturating narrow to signed 16 bits.
    vqrshrn.s32   d0, q0, #0x6          @ c[i] = sat16((q[i] + 32) >> 6), i = 0..3
    vqrshrn.s32   d1, q1, #0x6          @ c[i], i = 4..7
    vqrshrn.s32   d2, q2, #0x6          @ c[i], i = 8..11
    vqrshrn.s32   d3, q3, #0x6          @ c[i], i = 12..15

    vst1.s16      {d0, d1, d2, d3}, [r1] @ store all 16 results contiguously to pi2_out

    vpop          {d8-d15}              @ restore d8-d15
    ldmfd         sp!, {r4-r12, r15}    @ restore r4-r12 and return (saved lr is popped into pc)
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
1647497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description:
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  The inverse-transformed content is then scaled based on the Qp value.
@ *  The DC blocks of both U and V are processed.
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src
@ *  input 1x8 block of coeffs. First 4 are from U and next 4 from V
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pi2_out
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  output 1x8 block
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  pointer to scaling list
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  pointer to weight matrix
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *  Floor (qp/6)
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns none
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks none
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
1927497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *******************************************************************************
1957497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ *
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                  WORD16* pi2_out,
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                  const UWORD16 *pu2_iscal_mat,
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                  const UWORD16 *pu2_weigh_mat,
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@                                  UWORD32 u4_qp_div_6,
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    .global ih264_ihadamard_scaling_2x2_uv_a9
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_ihadamard_scaling_2x2_uv_a9:
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Registers used
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@   r0 : *pi2_src
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@   r1 : *pi2_out
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@   r2 : *pu2_iscal_mat
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@   r3 : *pu2_weigh_mat
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u16      d26[0], [r2]
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u16      d27[0], [r3]
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmull.u16     q15, d26, d27         @pu2_iscal_mat[0] *  pu2_weigh_mat[0]
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.u32      q15, d30[0]
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld1.u16      d28[0], [sp]          @load qp/6
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpush         {d8-d15}
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmov.u16      d29, #5
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.u16     q14, d28, d29         @qp\6 - 5
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vdup.s32      q14, d28[0]
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vld2.s16      {d0, d1}, [r0]        @load 8 dc coeffs
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        @i2_x4,i2_x6,i2_y4,i1_y6 -> d0
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                        @i2_x5,i2_x7,i2_y5,i1_y6 -> d1
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vaddl.s16     q1, d0, d1            @  i4_x0 = i4_x4 + i4_x5;...x2
2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsubl.s16     q2, d0, d1            @  i4_x1 = i4_x4 - i4_x5;...x3
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vtrn.s32      q1, q2                @i4_x0 i4_x1 -> q1
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vadd.s32      q3, q1, q2            @i4_x4 = i4_x0+i4_x2;.. i4_x5
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vsub.s32      q1, q1, q2            @i4_x6 = i4_x0-i4_x2;.. i4_x7
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s32      q5, q3, q15
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmul.s32      q6, q1, q15
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q7, q5, q14
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vshl.s32      q8, q6, q14
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovn.s32     d18, q7               @i4_x4 i4_x5 i4_y4 i4_y5
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vmovn.s32     d19, q8               @i4_x6 i4_x7 i4_y6 i4_y7
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vst2.s32      {d18-d19}, [r1]
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    vpop          {d8-d15}
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bx            lr
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
251