18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@/****************************************************************************** 28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Copyright (C) 2015 The Android Open Source Project 48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Licensed under the Apache License, Version 2.0 (the "License"); 68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * you may not use this file except in compliance with the License. 78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * You may obtain a copy of the License at: 88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * http://www.apache.org/licenses/LICENSE-2.0 108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Unless required by applicable law or agreed to in writing, software 128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * distributed under the License is distributed on an "AS IS" BASIS, 138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * See the License for the specific language governing permissions and 158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * limitations under the License. 168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ***************************************************************************** 188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore 198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@*/ 207497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@** 218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @file 238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * ih264_iquant_itrans_recon_a9.s 248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief 268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Contains function definitions for single stage inverse transform 278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @author 298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Mohit 308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Harinarayanaan 318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par List of Functions: 338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * - ih264_iquant_itrans_recon_4x4_a9() 348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * - ih264_iquant_itrans_recon_8x8_a9() 358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * - ih264_iquant_itrans_recon_chroma_4x4_a9() 368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks 388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * None 398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 417497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@* 427497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@** 438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ 
******************************************************************************* 448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief 468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block 478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description: 498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Performs inverse transform Ci4 and adds the residue to get the 508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * reconstructed block 518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src 538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Input 4x4 coefficients 548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu1_pred 568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Prediction 4x4 block 578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pu1_out 598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Output 4x4 block 608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6 628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * QP 638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat 658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to weight matrix 668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pred_strd, 688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Prediction stride 
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] out_strd 718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Output Stride 728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *@param[in] pi2_tmp 748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * temporary buffer of size 1*16 758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat 778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to the inverse quantization matrix 788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns Void 808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks 828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * None 838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 857497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ * 868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src, 878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD8 *pu1_pred, 888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD8 *pu1_out, 898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 pred_strd, 908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 out_strd, 918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ const UWORD16 *pu2_iscal_mat, 928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ const UWORD16 *pu2_weigh_mat, 938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD32 u4_qp_div_6, 948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 *pi4_tmp, 
@                                   WORD32 iq_start_idx,
@                                   WORD16 *pi2_dc_ld_addr)
@
@ Register usage inside ih264_iquant_itrans_recon_4x4_a9
@ ------------------------------------------------------
@   r0  : pi2_src          (register argument)
@   r1  : pu1_pred         (register argument, advanced by pred_strd per row)
@   r2  : pu1_out          (register argument, advanced by out_strd per row)
@   r3  : pred_strd        (register argument)
@   r4  : out_strd         (stack argument, sp + 40 after the push)
@   r5  : pu2_iscal_mat    (stack argument, sp + 44)
@   r6  : pu2_weigh_mat    (stack argument, sp + 48)
@   r7  : u4_qp_div_6      (stack argument, sp + 52)
@   r8  : iq_start_idx     (stack argument, sp + 60)
@   r9  : DC value fetched from pi2_dc_ld_addr (only when iq_start_idx == 1)
@   r10 : pi2_dc_ld_addr   (stack argument, sp + 64)
@ The stack slot at sp + 56 (pi4_tmp in the prototype above) is never read.

.text
.syntax unified
.p2align 2

    .global ih264_iquant_itrans_recon_4x4_a9

ih264_iquant_itrans_recon_4x4_a9:

@ Data layout note: every 16-halfword table (scaling matrix, weight matrix,
@ source coefficients) is fetched with vld4.16, which de-interleaves with a
@ stride of four elements.  Each d register therefore receives elements
@ {k, k+4, k+8, k+12}; for a row-major 4x4 block that is one column, with the
@ four lanes indexing the rows.
@
@ Dequantisation is done as a left shift by u4_qp_div_6 followed by a single
@ saturating, rounding right shift by 4 (vqrshrn), instead of choosing between
@ a left and a right shift depending on the value of u4_qp_div_6.

    stmfd       sp!, {r4-r12, r14}          @ save 10 core registers (40 bytes)

    @ Fetch the stack-passed arguments, in increasing stack-offset order.
    ldr         r4, [sp, #40]               @ r4  = out_strd
    ldr         r5, [sp, #44]               @ r5  = pu2_iscal_mat
    ldr         r6, [sp, #48]               @ r6  = pu2_weigh_mat
    ldr         r7, [sp, #52]               @ r7  = u4_qp_div_6
    ldr         r8, [sp, #60]               @ r8  = iq_start_idx
    ldr         r10, [sp, #64]              @ r10 = pi2_dc_ld_addr
    vdup.s32    q15, r7                     @ q15 = u4_qp_div_6 in all four 32-bit lanes

    vpush       {d8-d15}                    @ d8-d15 are written below, so preserve them

@ ---------------------------------------------------------------------------
@ Inverse quantisation
@ ---------------------------------------------------------------------------

    vld4.s16    {d20, d21, d22, d23}, [r5]  @ d20..d23 = columns 0..3 of pu2_iscal_mat
    vld4.s16    {d26, d27, d28, d29}, [r6]  @ d26..d29 = columns 0..3 of pu2_weigh_mat
    vmul.s16    q10, q10, q13               @ columns 0,1: iscal * weight (16-bit product)
    vld4.s16    {d16, d17, d18, d19}, [r0]  @ d16..d19 = columns 0..3 of pi2_src

    vmul.s16    q11, q11, q14               @ columns 2,3: iscal * weight (16-bit product)

    subs        r8, r8, #1                  @ Z is set exactly when iq_start_idx == 1
    ldrsheq     r9, [r10]                   @ if so, r9 = (signed halfword) pi2_dc_ld_addr[0]

    vmull.s16   q0, d16, d20                @ q0 = src * (iscal * weight), column 0, 32-bit
    vmull.s16   q1, d17, d21                @ q1 = same for column 1
    vmull.s16   q2, d18, d22                @ q2 = same for column 2
    vmull.s16   q3, d19, d23                @ q3 = same for column 3

    vshl.s32    q0, q0, q15                 @ column 0 << u4_qp_div_6
    vshl.s32    q1, q1, q15                 @ column 1 << u4_qp_div_6
    vshl.s32    q2, q2, q15                 @ column 2 << u4_qp_div_6
    vshl.s32    q3, q3, q15                 @ column 3 << u4_qp_div_6

    vqrshrn.s32 d0, q0, #0x4                @ d0 = sat16((column 0 + 8) >> 4)
    vqrshrn.s32 d1, q1, #0x4                @ d1 = sat16((column 1 + 8) >> 4)
    vqrshrn.s32 d2, q2, #0x4                @ d2 = sat16((column 2 + 8) >> 4)
    vqrshrn.s32 d3, q3, #0x4                @ d3 = sat16((column 3 + 8) >> 4)

    vmoveq.16   d0[0], r9                   @ iq_start_idx == 1: overwrite element 0 (DC)
                                            @ with the value loaded from pi2_dc_ld_addr

@ ---------------------------------------------------------------------------
@ Inverse transform, first pass: butterfly across the four column registers
@ (c0..c3 = d0..d3).  Prediction rows are loaded in between the arithmetic.
@ ---------------------------------------------------------------------------

    vld1.32     d30[0], [r1], r3            @ prediction row 0 (4 bytes), r1 += pred_strd
    vadd.s16    d4, d0, d2                  @ x0 = c0 + c2

    vsub.s16    d5, d0, d2                  @ x1 = c0 - c2

    vshr.s16    d8, d1, #1                  @ c1 >> 1
    vshr.s16    d9, d3, #1                  @ c3 >> 1

    vsub.s16    d6, d8, d3                  @ x2 = (c1 >> 1) - c3
    vadd.s16    d7, d1, d9                  @ x3 = c1 + (c3 >> 1)
    vld1.32     d30[1], [r1], r3            @ prediction row 1 (4 bytes), r1 += pred_strd

    vswp        d6, d7                      @ q3 becomes {x3, x2} so it pairs with q2 = {x0, x1}

    vsub.s16    q6, q2, q3                  @ d12 = x0 - x3, d13 = x1 - x2
    vadd.s16    q5, q2, q3                  @ d10 = x0 + x3, d11 = x1 + x2

    vld1.32     d31[0], [r1], r3            @ prediction row 2 (4 bytes), r1 += pred_strd

    vswp        d12, d13                    @ d10..d13 now hold first-pass outputs 0..3 in order

@ ---------------------------------------------------------------------------
@ Inverse transform, second pass: transpose the 4x4 block of halfwords held in
@ d10..d13, then apply the same butterfly again.
@ ---------------------------------------------------------------------------

    vtrn.16     d10, d11                    @ 4x4 halfword transpose, step 1
    vtrn.16     d12, d13
    vtrn.32     d10, d12                    @ 4x4 halfword transpose, step 2
    vtrn.32     d11, d13
    vadd.s16    d14, d10, d12               @ x0 = r0 + r2

    vsub.s16    d15, d10, d12               @ x1 = r0 - r2

    vshr.s16    d18, d11, #1                @ r1 >> 1
    vshr.s16    d19, d13, #1                @ r3 >> 1

    vsub.s16    d16, d18, d13               @ x2 = (r1 >> 1) - r3
    vadd.s16    d17, d11, d19               @ x3 = r1 + (r3 >> 1)

    vld1.32     d31[1], [r1], r3            @ prediction row 3 (4 bytes), r1 += pred_strd
    vswp        d16, d17                    @ q8 becomes {x3, x2} so it pairs with q7 = {x0, x1}

    vsub.s16    q11, q7, q8                 @ d22 = x0 - x3, d23 = x1 - x2
    vadd.s16    q10, q7, q8                 @ d20 = x0 + x3, d21 = x1 + x2

    vswp        d22, d23                    @ d20..d23 now hold residual rows 0..3 in order

@ ---------------------------------------------------------------------------
@ Reconstruction: round the residual, add the prediction, clip, store.
@ ---------------------------------------------------------------------------

    vrshr.s16   q10, q10, #6                @ rows 0,1: (residual + 32) >> 6
    vrshr.s16   q11, q11, #6                @ rows 2,3: (residual + 32) >> 6

    vaddw.u8    q10, q10, d30               @ rows 0,1: add zero-extended prediction bytes
    vaddw.u8    q11, q11, d31               @ rows 2,3: add zero-extended prediction bytes

    vqmovun.s16 d0, q10                     @ rows 0,1: saturate to unsigned 8-bit
    vqmovun.s16 d1, q11                     @ rows 2,3: saturate to unsigned 8-bit

    vst1.32     d0[0], [r2], r4             @ store output row 0, r2 += out_strd
    vst1.32     d0[1], [r2], r4             @ store output row 1, r2 += out_strd
    vst1.32     d1[0], [r2], r4             @ store output row 2, r2 += out_strd
    vst1.32     d1[1], [r2]                 @ store output row 3

    vpop        {d8-d15}                    @ restore the NEON registers saved on entry
    ldmfd       sp!, {r4-r12, r15}          @ restore core registers; saved lr goes to pc (return)


Mahendrakar@** 2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief 2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * This function performs inverse quant and Inverse transform type Ci4 for 4*4 block 2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description: 2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Performs inverse transform Ci4 and adds the residue to get the 2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * reconstructed block 2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src 2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Input 4x4 coefficients 2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu1_pred 2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Prediction 4x4 block 2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pu1_out 2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Output 4x4 block 2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6 2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * QP 2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat 2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to weight matrix 2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pred_strd, 
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Prediction stride 2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] out_strd 2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Output Stride 2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ *@param[in] pi2_tmp 2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * temporary buffer of size 1*16 2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat 2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to the inverse quantization matrix 2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns Void 2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks 2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * None 2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 2727497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ * 2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src, 2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD8 *pu1_pred, 2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD8 *pu1_out, 2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 pred_strd, 2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 out_strd, 2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ const UWORD16 *pu2_iscal_mat, 2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ const UWORD16 *pu2_weigh_mat, 2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD32 
u4_qp_div_6, 2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 *pi4_tmp, 2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD16 *pi2_dc_src) 2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@**************Variables Vs Registers***************************************** 2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r0 => *pi2_src 2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r1 => *pu1_pred 2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r2 => *pu1_out 2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r3 => pred_strd 2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r4 => out_strd 2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r5 => *pu2_iscal_mat 2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r6 => *pu2_weigh_mat 2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r7 => u4_qp_div_6 (r8 => *pi2_dc_src, r9 => pi2_dc_src[0]) 2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S .global ih264_iquant_itrans_recon_chroma_4x4_a9 2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sih264_iquant_itrans_recon_chroma_4x4_a9: 2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4 2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@If the macro value changes, the instruction needs to be changed accordingly.
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Only one shift is done in horizontal inverse because, 2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@if u4_qp_div_6 is less than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value 3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0 3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S stmfd sp!, {r4-r12, r14} @Save r4-r12 and lr (40 bytes); stack arguments now start at [sp, #40] 3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r7, [sp, #52] @Loads u4_qp_div_6 3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r4, [sp, #40] @Loads out_strd 3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15 (per-lane shift count for vshl) 3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r5, [sp, #44] @Loads *pu2_iscal_mat 3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r6, [sp, #48] @Loads *pu2_weigh_mat 3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r8, [sp, #60] @Loads *pi2_dc_src ([sp, #56], pi4_tmp, is not read) 3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vpush {d8-d15} @Save d8-d15; they are used as scratch below and restored before return 3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@=======================DEQUANT FROM HERE=================================== 3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld4.s16 {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i =0..15 3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld4.s16 {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i =0..15 3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q10, q10, q13 @x[i]=(scale[i] * dequant[i]) where i = 0..7 3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld4.s16 {d16, d17, d18, d19}, [r0]
@pi2_src_tmp[i], i =0..15 3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q11, q11, q14 @x[i]=(scale[i] * dequant[i]) where i = 8..15 3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3 3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7 3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11 3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15 3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d0, q0, #0x4 @ D0 = c[i] = sat16((q[i] + 8) >> 4) where i = 0..3 3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d1, q1, #0x4 @ D1 = c[i] = sat16((q[i] + 8) >> 4) where i = 4..7 3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d2, q2, #0x4 @ D2 = c[i] = sat16((q[i] + 8) >> 4) where i = 8..11 3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d3, q3, #0x4 @ D3 = c[i] = sat16((q[i] + 8) >> 4) where i = 12..15 3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldrsh r9, [r8] @ Loads signed halfword pi2_dc_src[0] 3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmov.16 d0[0], r9 @ Replace lane 0 of d0 with the DC value from pi2_dc_src[0] (chroma iq-it) 3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= PROCESS IDCT FROM HERE ======= 3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 1: 3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------ 3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld2.8 {d28, d29}, [r1], r3 @I row Load pu1_pred buffer (de-interleaved, even bytes land in d28) 3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 d4, d0, d2 @x0 = d0 + d2; 3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 d5, d0, d2 @x1 = d0 - d2; 3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 d8, d1, #1 @d1>>1 3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 d9, d3, #1 @d3>>1 3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 d6, d8, d3 @x2 = (d1 >> 1) - d3; 3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 d7, d1, d9 @x3 = d1 + (d3 >> 1); 3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld2.8 {d29, d30}, [r1], r3 @II row Load pu1_pred buffer 3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d6, d7 @Reverse positions of x2 and x3, so q2 = {x0, x1} and q3 = {x3, x2} 3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q6, q2, q3 @x0-x3 and x1-x2 combined 3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vtrn.32 d28, d29 @ D28 -- row I and II of pu1_pred_buffer 3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q5, q2, q3 @x0 + x3 and x1+x2 combined
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld2.8 {d29, d30}, [r1], r3 @III row Load pu1_pred buf 3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d12, d13 @d12 = x1-x2, d13 = x0-x3 (d10..d13 now in output order) 3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2: 3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------ 3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vtrn.16 d10, d11 @4x4 transpose of d10..d13: two vtrn.16 followed by two vtrn.32 3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vtrn.16 d12, d13 3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vtrn.32 d10, d12 3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vtrn.32 d11, d13 3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 d14, d10, d12 @x0 = d10 + d12; 3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 d15, d10, d12 @x1 = d10 - d12; 3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 d18, d11, #1 @d11>>1 3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 d19, d13, #1 @d13>>1 3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 d16, d18, d13 @x2 = (d11 >> 1) - d13; 3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 d17, d11, d19 @x3 = d11 + (d13 >> 1); 3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld2.8 {d30, d31}, [r1], r3 @IV row Load pu1_pred buffer 3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d16, d17 @Reverse positions of x2 and x3, so q7 = {x0, x1} and q8 = {x3, x2} 3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q11, q7, q8 @x0-x3 and x1-x2 combined 3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vtrn.32 d29, d30 @ D29 -- row III and IV of
pu1_pred_buf 3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q10, q7, q8 @x0 + x3 and x1+x2 combined 3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d22, d23 @d22 = x1-x2, d23 = x0-x3 3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q10, q10, #6 @ Rounding shift: (x + 32) >> 6 3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q11, q11, #6 3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q10, q10, d28 @Add widened prediction bytes (rows I and II) to the residue 3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q11, q11, d29 @Add widened prediction bytes (rows III and IV) to the residue 3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.u8 d0, [r2], r4 @Load 4 rows (8 bytes each) of the existing out buffer 3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.u8 d1, [r2], r4 3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.u8 d2, [r2], r4 3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.u8 d3, [r2], r4 3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sub r2, r2, r4, lsl #2 @Rewind r2 by 4 * out_strd, back to the first out row 3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d20, q10 @Saturate reconstructed values to unsigned 8 bit 4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d22, q11 4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmovl.u8 q10, d20 @Widen the results back to 16 bit 4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmovl.u8 q11, d22 @so that we can use vbit to copy 4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmov.u16 q14, #0x00ff @Mask selecting the low byte of every 16-bit lane 4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vbit.u8 q0,
q10, q14 @Insert results into the masked (even) bytes of out; odd bytes are left unchanged 4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vbit.u8 q1, q11, q14 4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.u8 d0, [r2], r4 @Store the 4 merged rows back to pu1_out 4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.u8 d1, [r2], r4 4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.u8 d2, [r2], r4 4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.u8 d3, [r2] 4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vpop {d8-d15} 4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldmfd sp!, {r4-r12, r15} @Reload the registers from SP; popping the saved lr into pc returns 4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4207497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@* 4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @brief 4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * This function performs inverse quant and Inverse transform type Ci8 for 8*8 block 4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @par Description: 4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Performs inverse transform Ci8 and adds the residue to get the 4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * reconstructed block 4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi2_src 4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Input 8x8 coefficients 4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu1_pred
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Prediction 8x8 block 4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[out] pu1_out 4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Output 8x8 block 4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] u4_qp_div_6 4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * QP / 6 (used as the left-shift amount during dequantization) 4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_weigh_mat 4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to weight matrix 4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pred_strd 4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Prediction stride 4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] out_strd 4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Output stride 4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pi4_tmp 4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Temporary buffer of size 1*64 4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @param[in] pu2_iscal_mat 4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * Pointer to the inverse quantization matrix 4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @returns Void 4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * 4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * @remarks 4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ * None 4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha
S@ * 4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ ******************************************************************************* 4637497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar@ * 4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src, 4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD8 *pu1_pred, 4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD8 *pu1_out, 4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 pred_strd, 4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 out_strd, 4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ const UWORD16 *pu2_iscal_mat, 4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ const UWORD16 *pu2_weigh_mat, 4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ UWORD32 u4_qp_div_6, 4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 *pi4_tmp, 4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ WORD32 iq_start_idx) 4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@**************Variables Vs Registers***************************************** 4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r0 => *pi2_src 4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r1 => *pu1_pred 4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r2 => *pu1_out 4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r3 => pred_strd 4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r4 => out_strd 4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r5 => *pu2_iscal_mat 4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r6 => *pu2_weigh_mat 4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@r7 => u4_qp_div_6 4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S .global ih264_iquant_itrans_recon_8x8_a9 4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
Sih264_iquant_itrans_recon_8x8_a9: 4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments 4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r7, [sp, #52] @Loads u4_qp_div_6 4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r4, [sp, #40] @Loads out_strd 4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r5, [sp, #44] @Loads *pu2_iscal_mat 4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldr r6, [sp, #48] @Loads *pu2_weigh_mat 4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vdup.s32 q15, r7 @Populate the u4_qp_div_6 in Q15 4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vpush {d8-d15} 4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sidct_8x8_begin: 4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= DEQUANT FROM HERE =========== 5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q13}, [r5]! @ Q13 = dequant values row 0 5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q10}, [r6]! @ Q10 = scaling factors row 0 5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q14}, [r5]! @ Q14 = dequant values row 1 5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q10, q10, q13 @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7 5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q11}, [r6]! @ Q11 = scaling factors row 1 5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q8}, [r0]! 
@ Q8 = Source row 0 5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q11, q11, q14 @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15 5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q0, d16, d20 @ Q0 = p[i] = (x[i] * trns_coeff[i]) where i = 0..3 5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q9}, [r0]! @ Q8 = Source row 1 5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q1, d17, d21 @ Q1 = p[i] = (x[i] * trns_coeff[i]) where i = 4..7 5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q2, d18, d22 @ Q2 = p[i] = (x[i] * trns_coeff[i]) where i = 8..11 5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q13}, [r6]! @ Scaling factors row 2 5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q3, d19, d23 @ Q3 = p[i] = (x[i] * trns_coeff[i]) where i = 12..15 5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q14}, [r6]! @ Scaling factors row 3 5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q0, q0, q15 @ Q0 = q[i] = (p[i] << (qP/6)) where i = 0..3 5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q10}, [r5]! @ Q10 = Dequant values row 2 5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q1, q1, q15 @ Q1 = q[i] = (p[i] << (qP/6)) where i = 4..7 5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q8}, [r0]! @ Source Row 2 5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q2, q2, q15 @ Q2 = q[i] = (p[i] << (qP/6)) where i = 8..11 5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q11}, [r5]! @ Q11 = Dequant values row 3 5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q3, q3, q15 @ Q3 = q[i] = (p[i] << (qP/6)) where i = 12..15 5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q9}, [r0]! 
@ Source Row 3 5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q10, q10, q13 @ Dequant row2*scale matrix row 2 5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q11, q11, q14 @ Dequant row 3*scale matrix row 3 5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q4}, [r6]! @ Scaling factors row 4 5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d0, q0, #0x6 @ D0 = c[i] = ((q[i] + 32) >> 6) where i = 0..3 5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d1, q1, #0x6 @ D1 = c[i] = ((q[i] + 32) >> 6) where i = 4..7 5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q5}, [r6]! @ Scaling factors row 5 5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d2, q2, #0x6 @ D2 = c[i] = ((q[i] + 32) >> 6) where i = 8..11 5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d3, q3, #0x6 @ D3 = c[i] = ((q[i] + 32) >> 6) where i = 12..15 5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q13}, [r5]! @ Q13 = Dequant values row 4 5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q2, d16, d20 @ p[i] = (x[i] * trns_coeff[i]) where i=16..19 5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q3, d17, d21 @ p[i] = (x[i] * trns_coeff[i]) where i=20..23 5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q12}, [r5]! @ Q12 = Dequant values row 5 5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q6, d18, d22 @ p[i] = (x[i] * trns_coeff[i]) where i=24..27 5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q7, d19, d23 @ p[i] = (x[i] * trns_coeff[i]) where i=28..31 5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q14}, [r0]! 
@ Source row 4 5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q10, q4, q13 @ Dequant row4*scale matrix row 4 5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q11, q5, q12 @ Dequant row5*scale matrix row 5 5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q9}, [r0]! @ Source row 5 5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q2, q2, q15 @ 5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q3, q3, q15 @ 5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q13}, [r6]! @ Scaling factors row 6 5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q6, q6, q15 @ 5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q7, q7, q15 @ 5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q4, d28, d20 @ i = 32..35 5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d4, q2, #0x6 @ D4 = c[i] = ((q[i] + 32) >> 6) where i = 16..19 5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d5, q3, #0x6 @ D5 = c[i] = ((q[i] + 32) >> 6) where i = 20..23 5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q5, d29, d21 @ i =36..39 5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q10}, [r5]! @ Dequant values row 6 5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d6, q6, #0x6 @ D6 = c[i] = ((q[i] + 32) >> 6) where i = 24..27 5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d7, q7, #0x6 @ D7 = c[i] = ((q[i] + 32) >> 6) where i = 28..31 5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q14}, [r6]! @ Scaling factors row 7 5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q6, d18, d22 @ 5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q8}, [r0]! @ Source row 6 5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q7, d19, d23 @ 5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q11}, [r5]! 
@ Dequant values row 7 5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q4, q4, q15 @ 5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 {q9}, [r0]! @ Source row 7 5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q5, q5, q15 @ 5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q6, q6, q15 @ 5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q7, q7, q15 @ 5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q10, q10, q13 @ Dequant*scaling row 6 5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmul.s16 q11, q11, q14 @ Dequant*scaling row 7 5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d8, q4, #0x6 @ D8 = c[i] = ((q[i] + 32) >> 6) where i = 32..35 5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d9, q5, #0x6 @ D9 = c[i] = ((q[i] + 32) >> 6) where i = 36..39 5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d10, q6, #0x6 @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43 5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d11, q7, #0x6 @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47 5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q6, d16, d20 @ i= 48..51 5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q7, d17, d21 @ i= 52..55 5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q8, d18, d22 @ i=56..59 5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vmull.s16 q9, d19, d23 @ i=60..63 5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q6, q6, q15 @ 5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q0, q1 @Transpose 5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q7, q7, q15 @ 5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q8, q8, q15 @ 5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q2, q3 @ 
5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshl.s32 q9, q9, q15 @ 5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d12, q6, #0x6 @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51 5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q4, q5 @Transpose 5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d13, q7, #0x6 @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55 5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d14, q8, #0x6 @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59 5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q0, q2 @Transpose 5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqrshrn.s32 d15, q9, #0x6 @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63 5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@========= PROCESS IDCT FROM HERE ======= 5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2: 5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------ 5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ TRANSPOSE 8x8 coeffs to actual order 5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q6, q7 @ 5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q1, q3 @ 5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q4, q6 @ 5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q5, q7 @ 6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d1, d8 @ Q0/Q1 = Row order x0/x1 6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d3, d10 @ Q2/Q3 = Row order x2/x3 6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp 
d5, d12 @ Q4/Q5 = Row order x4/x5 6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d7, d14 @ Q6/Q7 = Row order x6/x7 6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp q1, q4 @ 6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q10, q2, #0x1 @ 6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp q3, q6 @ 6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 1: 6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------ 6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q8, q0, q4 @ Q8 = y0 6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q9, q0, q4 @ Q9 = y2 6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsra.s16 q2, q6, #0x1 @ Q2 = y6 6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q6, q10, q6 @ Q6 = y4 6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q12, d14, d2 @ y3 (0-3) 1+7 6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q13, d15, d3 @ y3 (4-7) 1+7 6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q10, d14, d2 @ y5 (0-3) 7-1 6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q11, d15, d3 @ y5 (4-7) 7-1 6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q0, q8, q2 @ Q0 = z0 6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q4, q8, q2 @ Q4 = z6 6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q8, q9, q6 @ Q8 = z2 6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q2, 
q9, q6 @ Q2 = z4 6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q12, q12, d6 @ y3 (0-3) 1+7-3 6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q13, q13, d7 @ y3 (0-7) 1+7-3 6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q6, q3, #0x1 @ 6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q10, q10, d10 @ 6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q11, q11, d11 @ 6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q9, q5, #0x1 @ 6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q12, q12, d12 @ 6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q13, q13, d13 @ 6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q10, q10, d18 @ 6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q11, q11, d19 @ 6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d12, q12 @ 6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q12, d10, d6 @ 6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d13, q13 @ Q6 = y3 6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q13, d11, d7 @ 6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d18, q10 @ 6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q10, d10, d6 @ 6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d19, q11 @ Q9 = y5 6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q11, d11, d7 @ 6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q3, q6, #0x2 @ 6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsra.s16 q6, q9, #0x2 @ Q6 = z3 6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q12, q12, d2 @ 6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q13, q13, d3 @ 6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q1, #0x1 @ 6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q5, q3, q9 @ Q5 = z5 6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q10, q10, d14 @ 6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q11, q11, d15 @ 6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q7, #0x1 @ 6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q12, q12, d2 @ 6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q13, q13, d3 @ 6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q10, q10, d14 @ 6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q11, q11, d15 @ 6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d14, q12 @ 6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q1, q8, q5 @ Q1 = x1 6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d15, q13 @ Q7 = y7 6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q3, q8, q5 @ Q3 = x6 
6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d18, q10 @ 6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q5, q2, q6 @ Q5 = x5 6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d19, q11 @ Q9 = y1 6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q2, q2, q6 @ Q2 = x2 6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q12, q9, #0x2 @ 6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsra.s16 q9, q7, #0x2 @ Q9 = z1 6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q11, q7, q12 @ Q11 = z7 6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q6, q4, q9 @ Q6 = x3 6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q4, q4, q9 @ Q4 = x4 6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q7, q0, q11 @ Q7 = x7 6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q0, q0, q11 @ Q0 = x0 6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp.s16 q3, q6 @ Q3 = x3, Q6 = x6 7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 2: 7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------ 7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ TRANSPOSE 8x8 coeffs to actual order 7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q0, q1 @ 7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q2, q3 @ 7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q4, q5 
@ 7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s16 q6, q7 @ 7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q0, q2 @ 7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q1, q3 @ 7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q4, q6 @ 7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vzip.s32 q5, q7 @ 7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d1, d8 @ Q0/Q1 = Row order x0/x1 7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d3, d10 @ Q2/Q3 = Row order x2/x3 7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d5, d12 @ Q4/Q5 = Row order x4/x5 7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp d7, d14 @ Q6/Q7 = Row order x6/x7 7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp q1, q4 @ 7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q10, q2, #0x1 @ 7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp q3, q6 @ 7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Steps for Stage 3: 7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@------------------ 7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@Repeat stage 1 again for vertical transform 7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q8, q0, q4 @ Q8 = y0 7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d28, [r1], r3 @ Q12 = 0x070605....0x070605.... 
7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q9, q0, q4 @ Q9 = y2 7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsra.s16 q2, q6, #0x1 @ Q2 = y6 7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q6, q10, q6 @ Q6 = y4 7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q12, d14, d2 @ 7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d29, [r1], r3 @ Q12 = 0x070605....0x070605.... 7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q13, d15, d3 @ 7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q10, d14, d2 @ 7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d30, [r1], r3 @ Q12 = 0x070605....0x070605.... 7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q11, d15, d3 @ 7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q0, q8, q2 @ Q0 = z0 7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d31, [r1], r3 @ Q12 = 0x070605....0x070605.... 
7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q4, q8, q2 @ Q4 = z6 7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q8, q9, q6 @ Q8 = z2 7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q2, q9, q6 @ Q2 = z4 7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q12, q12, d6 @ 7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q13, q13, d7 @ 7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q6, q3, #0x1 @ 7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q10, q10, d10 @ 7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q11, q11, d11 @ 7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q9, q5, #0x1 @ 7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q12, q12, d12 @ 7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q13, q13, d13 @ 7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q10, q10, d18 @ 7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q11, q11, d19 @ 7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d12, q12 @ 7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q12, d10, d6 @ 7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d13, q13 @ Q6 = y3 7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddl.s16 q13, d11, d7 @ 7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d18, q10 @ 7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q10, d10, 
d6 @ 7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d19, q11 @ Q9 = y5 7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubl.s16 q11, d11, d7 @ 7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q3, q6, #0x2 @ 7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsra.s16 q6, q9, #0x2 @ Q6 = z3 7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q12, q12, d2 @ 7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q13, q13, d3 @ 7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q1, #0x1 @ 7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q5, q3, q9 @ Q5 = z5 7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q10, q10, d14 @ 7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q11, q11, d15 @ 7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q7, #0x1 @ 7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q12, q12, d2 @ 7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.s16 q13, q13, d3 @ 7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q10, q10, d14 @ 7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsubw.s16 q11, q11, d15 @ 7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d14, q12 @ 8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q1, q8, q5 @ Q1 = x1 
8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d15, q13 @ Q7 = y7 8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q3, q8, q5 @ Q3 = x6 8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d18, q10 @ 8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q5, q2, q6 @ Q5 = x5 8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovn.s32 d19, q11 @ Q9 = y1 8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q2, q2, q6 @ Q2 = x2 8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vshr.s16 q12, q9, #0x2 @ 8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsra.s16 q9, q7, #0x2 @ Q9 = z1 8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q11, q7, q12 @ Q11 = z7 8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q6, q4, q9 @ Q6 = x3 8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q4, q4, q9 @ Q4 = x4 8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vsub.s16 q7, q0, q11 @ Q7 = x7 8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vadd.s16 q0, q0, q11 @ Q0 = x0 8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vswp.s16 q3, q6 @ Q3 <-> Q6 8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q1, q1, #6 @ 8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d16, [r1], r3 @ Q12 = 0x070605....0x070605.... 8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q2, q2, #6 @ 8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q4, q4, #6 @ 8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d17, [r1], r3 @ Q12 = 0x070605....0x070605.... 
8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q5, q5, #6 @ 8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q7, q7, #6 @ 8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d18, [r1], r3 @ Q12 = 0x070605....0x070605.... 8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q0, q0, #6 @ 8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q3, q3, #6 @ 8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vld1.32 d19, [r1], r3 @ Q12 = 0x070605....0x070605.... 8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vrshr.s16 q6, q6, #6 @ 8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S@ Code Added to pack sign and magnitudes 8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q0, q0, d28 8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q1, q1, d29 8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q2, q2, d30 8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q3, q3, d31 8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d0, q0 8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q4, q4, d16 8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d1, q1 8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q5, q5, d17 8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d2, q2 8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q6, q6, d18 8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d3, q3 8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vaddw.u8 q7, q7, d19 8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d4, q4 8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d0, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 
8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d5, q5 8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d1, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d6, q6 8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d2, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vqmovun.s16 d7, q7 8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d3, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d4, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d5, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d6, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vst1.32 d7, [r2], r4 @ Magnitudes of 1st 4x4 block coeffs 8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Sidct_8x8_end: 8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S vpop {d8-d15} 8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ldmfd sp!, {r4-r12, r15} 8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 873