@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_ihadamard_scaling_a9.s
@ *
@ * @brief
@ *  Contains function definitions for the inverse Hadamard transform on the
@ *  4x4 DC outputs of 16x16 intra-prediction and on the 2x2 chroma DC outputs
@ *
@ * @author
@ *  Mohit
@ *
@ * @par List of Functions:
@ *  - ih264_ihadamard_scaling_4x4_a9()
@ *  - ih264_ihadamard_scaling_2x2_uv_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@ * @brief This function performs a 4x4 inverse Hadamard transform on the 4x4 DC
@ * coefficients of a 16x16 intra prediction macroblock, and then performs
@ * scaling.
@ *
@ * @par Description:
@ *  The DC coefficients pass through a 2-stage inverse Hadamard transform.
@ *  The inverse transformed content is then scaled based on the Qp value.
@ *
@ * @param[in] pi2_src
@ *  input 4x4 block of DC coefficients
@ *
@ * @param[out] pi2_out
@ *  output 4x4 block
@ *
@ * @param[in] pu2_iscal_mat
@ *  pointer to scaling list
@ *
@ * @param[in] pu2_weigh_mat
@ *  pointer to weight matrix
@ *
@ * @param[in] u4_qp_div_6
@ *  Floor (qp/6)
@ *
@ * @param[in] pi4_tmp
@ *  temporary buffer of size 1*16 (part of the prototype; the NEON routine
@ *  keeps its intermediates in registers and does not access this buffer)
@ *
@ * @returns none
@ *
@ * @remarks
@ *  - Only element [0] of pu2_iscal_mat and of pu2_weigh_mat is read.
@ *  - pi4_tmp is not accessed; all intermediates live in NEON registers.
@ *  - The final narrowing to 16 bits rounds and saturates (VQRSHRN).
@ *
@ *******************************************************************************
@ *
@ * void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
@ *                                  WORD16* pi2_out,
@ *                                  const UWORD16 *pu2_iscal_mat,
@ *                                  const UWORD16 *pu2_weigh_mat,
@ *                                  UWORD32 u4_qp_div_6,
@ *                                  WORD32* pi4_tmp)
@ *
@ **************Variables Vs Registers*****************************************
@ * r0      => pi2_src
@ * r1      => pi2_out
@ * r2      => pu2_iscal_mat, then pu2_iscal_mat[0] * pu2_weigh_mat[0]
@ * r3      => pu2_weigh_mat, then pu2_weigh_mat[0]
@ * r12     => u4_qp_div_6 (read from the first stack argument slot)
@ * q9      => scale factor replicated in all four 32-bit lanes
@ * q10     => u4_qp_div_6 replicated in all four 32-bit lanes
@ *
@ * Only caller-saved registers (r2, r3, r12, q0-q3, q8-q15) are written, so
@ * nothing is pushed to or popped from the stack.

.text
.p2align 2

        .global ih264_ihadamard_scaling_4x4_a9

ih264_ihadamard_scaling_4x4_a9:

@ VLD4.16 de-interleaves with a stride of four 16-bit elements, which equals
@ the row width of the 4x4 block (the original note ties this to
@ SUB_BLK_WIDTH_4x4). Each D register therefore receives one column, with one
@ row per lane. A different row width would need a different load instruction.
@
@ Scaling is applied once, after both transform stages:
@   out = saturate16( (((x * scale) << u4_qp_div_6) + 32) >> 6 )

    ldr           r12, [sp]               @ r12 = u4_qp_div_6 (5th argument, stack slot 0 on entry)
    ldrh          r3, [r3]                @ r3  = pu2_weigh_mat[0] (unsigned halfword)
    ldrh          r2, [r2]                @ r2  = pu2_iscal_mat[0] (unsigned halfword)
    vdup.s32      q10, r12                @ q10 = {qp/6, qp/6, qp/6, qp/6}
    mul           r2, r2, r3              @ r2  = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    vdup.s32      q9, r2                  @ q9  = scale factor in every lane

@=======================INVERSE HADAMARD TRANSFORM================================

    @ ---- stage 1: butterflies along each row (lane i of every register = row i)
    vld4.s16      {d0, d1, d2, d3}, [r0]  @ d0..d3 = columns 0..3 (x4, x5, x6, x7 of every row)
    vaddl.s16     q12, d0, d3             @ x0 = x4 + x7   (widened to 32 bit)
    vaddl.s16     q13, d1, d2             @ x1 = x5 + x6
    vsubl.s16     q14, d1, d2             @ x2 = x5 - x6
    vsubl.s16     q15, d0, d3             @ x3 = x4 - x7

    vadd.s32      q2, q12, q13            @ tmp column 0 = x0 + x1
    vadd.s32      q3, q15, q14            @ tmp column 1 = x3 + x2
    vsub.s32      q8, q12, q13            @ tmp column 2 = x0 - x1
    vsub.s32      q11, q15, q14           @ tmp column 3 = x3 - x2

    @ ---- 4x4 transpose of 32-bit elements: columns -> rows
    vtrn.32       q2, q3
    vtrn.32       q8, q11
    vswp          d5, d16                 @ high half of q2 <-> low half of q8
    vswp          d7, d22                 @ high half of q3 <-> low half of q11
                                          @ now q2, q3, q8, q11 = tmp rows 0, 1, 2, 3

    @ ---- stage 2: butterflies down each column (x4..x7 = rows 0..3)
    vadd.s32      q12, q2, q11            @ x0 = x4 + x7
    vadd.s32      q13, q3, q8             @ x1 = x5 + x6
    vsub.s32      q14, q3, q8             @ x2 = x5 - x6
    vsub.s32      q15, q2, q11            @ x3 = x4 - x7

    vadd.s32      q0, q12, q13            @ row 0 = x0 + x1
    vadd.s32      q1, q15, q14            @ row 1 = x3 + x2
    vsub.s32      q2, q12, q13            @ row 2 = x0 - x1
    vsub.s32      q3, q15, q14            @ row 3 = x3 - x2

@=======================SCALING====================================================

    vmul.s32      q0, q0, q9              @ p[i] = x[i] * scale,           i = 0..3
    vmul.s32      q1, q1, q9              @                                i = 4..7
    vmul.s32      q2, q2, q9              @                                i = 8..11
    vmul.s32      q3, q3, q9              @                                i = 12..15

    vshl.s32      q0, q0, q10             @ q[i] = p[i] << (qp/6),         i = 0..3
    vshl.s32      q1, q1, q10             @                                i = 4..7
    vshl.s32      q2, q2, q10             @                                i = 8..11
    vshl.s32      q3, q3, q10             @                                i = 12..15

    vqrshrn.s32   d0, q0, #0x6            @ c[i] = sat16((q[i] + 32) >> 6), i = 0..3
    vqrshrn.s32   d1, q1, #0x6            @                                 i = 4..7
    vqrshrn.s32   d2, q2, #0x6            @                                 i = 8..11
    vqrshrn.s32   d3, q3, #0x6            @                                 i = 12..15

    vst1.s16      {d0, d1, d2, d3}, [r1]  @ store all 16 results, row-major, to pi2_out

    bx            lr                      @ nothing was pushed, so return directly



@ *******************************************************************************
@ *
@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
@ *
@ * @par Description:
@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  This inverse transformed content is scaled based on Qp value.
@ *  The DC blocks of both U and V are processed
@ *
@ * @param[in] pi2_src
@ *  input 1x8 block of coeffs.
@ *  First 4 are from U and next 4 from V
@ *
@ * @param[out] pi2_out
@ *  output 1x8 block (4 results for U followed by 4 results for V)
@ *
@ * @param[in] pu2_iscal_mat
@ *  pointer to scaling list (only element [0] is read)
@ *
@ * @param[in] pu2_weigh_mat
@ *  pointer to weight matrix (only element [0] is read)
@ *
@ * @param[in] u4_qp_div_6
@ *  Floor (qp/6); only its low 16 bits are read
@ *
@ * @returns none
@ *
@ * @remarks
@ *  The narrowing to 16 bits truncates (VMOVN); no rounding or saturation
@ *  is applied.
@ *
@ *******************************************************************************
@ *
@ * void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
@ *                                     WORD16* pi2_out,
@ *                                     const UWORD16 *pu2_iscal_mat,
@ *                                     const UWORD16 *pu2_weigh_mat,
@ *                                     UWORD32 u4_qp_div_6,
@ *                                     ...
@ * (the prototype comment is cut off after u4_qp_div_6 in the original file;
@ *  the code below reads nothing beyond that argument)

        .global ih264_ihadamard_scaling_2x2_uv_a9
ih264_ihadamard_scaling_2x2_uv_a9:

@Registers used
@ r0  : *pi2_src
@ r1  : *pi2_out
@ r2  : *pu2_iscal_mat
@ r3  : *pu2_weigh_mat
@ q15 : pu2_iscal_mat[0] * pu2_weigh_mat[0] in every 32-bit lane
@ q14 : (u4_qp_div_6 - 5) in every 32-bit lane
@
@ Only caller-saved NEON registers (q0-q3, q8-q15) are written, so nothing is
@ pushed to or popped from the stack.

    @ ---- scale factor; lanes 1..3 of d26/d27 are never consumed
    vld1.u16      d26[0], [r2]            @ d26[0] = pu2_iscal_mat[0]
    vld1.u16      d27[0], [r3]            @ d27[0] = pu2_weigh_mat[0]
    vmull.u16     q15, d26, d27           @ q15[0] = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    vdup.u32      q15, d30[0]             @ replicate the product to all four lanes

    @ ---- shift amount; a negative count makes VSHL shift right
    vld1.u16      d28[0], [sp]            @ low halfword of u4_qp_div_6 (stack slot 0 on entry)
    vmov.u16      d29, #5
    vsubl.u16     q14, d28, d29           @ q14[0] = qp/6 - 5 (wraps to a negative s32 when qp/6 < 5)
    vdup.s32      q14, d28[0]             @ replicate (qp/6 - 5) to all four lanes

    @ ---- 2x2 transform of U (x) and V (y) together
    vld2.s16      {d0, d1}, [r0]          @ de-interleave the 8 DC coeffs:
                                          @   d0 = i2_x4, i2_x6, i2_y4, i2_y6
                                          @   d1 = i2_x5, i2_x7, i2_y5, i2_y7

    vaddl.s16     q1, d0, d1              @ q1 = i4_x0, i4_x2, i4_y0, i4_y2   (x0 = x4 + x5, x2 = x6 + x7)
    vsubl.s16     q2, d0, d1              @ q2 = i4_x1, i4_x3, i4_y1, i4_y3   (x1 = x4 - x5, x3 = x6 - x7)

    vtrn.s32      q1, q2                  @ q1 = x0, x1, y0, y1 ; q2 = x2, x3, y2, y3

    vadd.s32      q3, q1, q2              @ q3 = x4, x5, y4, y5   (x4 = x0 + x2, x5 = x1 + x3)
    vsub.s32      q1, q1, q2              @ q1 = x6, x7, y6, y7   (x6 = x0 - x2, x7 = x1 - x3)

    @ ---- scaling: (value * scale) shifted by (qp/6 - 5)
    vmul.s32      q10, q3, q15
    vmul.s32      q11, q1, q15

    vshl.s32      q12, q10, q14
    vshl.s32      q8, q11, q14

    vmovn.s32     d18, q12                @ d18 = x4, x5, y4, y5 (16 bit)
    vmovn.s32     d19, q8                 @ d19 = x6, x7, y6, y7 (16 bit)

    @ Interleaving on 32-bit granularity places the halfword pairs in the
    @ order (x4,x5) (x6,x7) (y4,y5) (y6,y7), i.e. four U results then four V.
    vst2.s32      {d18-d19}, [r1]

    bx            lr

