omxVCM4P10_TransformDequantLumaDCFromPair_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Symbols provided by other files: the block unpacker and the two
;// QP lookup tables indexed by the quantisation parameter

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixQPModTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//======================================================================
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In-place 4x4 butterfly transform followed by dequantisation of a
;// block of sixteen 16-bit values.
;//
;//   In:   r0 = pointer to the 4x4 block (read, then overwritten)
;//         r1 = QP, used as a byte index into both lookup tables
;//   Out:  the block at r0 holds the transformed, dequantised values
;//   Uses: r2-r5, D0-D13
;//         NOTE(review): r4-r5 and d8-d13 are presumably preserved by
;//         the M_START/M_END register lists below - confirm against
;//         armCOMM_s.h
;//======================================================================

;// Implementation is only assembled for the CortexA8 variant

        IF CortexA8

;// ---- ARM core registers ---------------------------------------------

;// Arguments
pBlk            RN 0
qpIdx           RN 1

;// Scratch
pDivTab         RN 2
pModTab         RN 3
qpShift         RN 4
qpScale         RN 5

;// ---- NEON registers -------------------------------------------------
;// D0-D3 carry the block through every stage; D4-D7 hold the butterfly
;// partial sums/differences of whichever pass is currently running.

;// Block as loaded
dSrc0           DN D0.S16
dSrc1           DN D1.S16
dSrc2           DN D2.S16
dSrc3           DN D3.S16

;// First-pass partial sums and differences
dRSumA          DN D4.S16
dRSumB          DN D5.S16
dRDifA          DN D6.S16
dRDifB          DN D7.S16

;// First-pass results (Q views are used by the 32-bit transpose step)
dRow0           DN D0.S16
dRow1           DN D1.S16
dRow2           DN D2.S16
dRow3           DN D3.S16
qRow01          QN Q0.32
qRow23          QN Q1.32

;// Second-pass partial sums and differences
dCSumA          DN D4.S16
dCSumB          DN D5.S16
dCDifA          DN D6.S16
dCDifB          DN D7.S16

;// Second-pass results
dCol0           DN D0.S16
dCol1           DN D1.S16
dCol2           DN D2.S16
dCol3           DN D3.S16

;// Dequantisation temporaries.  dMul shares D5 with dCSumB, which is
;// no longer live once the second pass has finished.
dMul            DN D5.S16
qAcc0           QN Q3.S32
qAcc1           QN Q4.S32
qAcc2           QN Q5.S32
qAcc3           QN Q6.S32

;// Final 16-bit results
dRes0           DN D0.S16
dRes1           DN D1.S16
dRes2           DN D2.S16
dRes3           DN D3.S16


        ;// No stack locals are required

        ;// Function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

        ;//--------------------------------------------------------------
        ;// Outline:
        ;//   1. Load the 4x4 block into four D registers, transposed
        ;//   2. Butterfly pass one, four lanes at a time
        ;//   3. Transpose the 4x4 intermediate
        ;//   4. Butterfly pass two
        ;//   5. Multiply by the scale factor, round, narrow and store
        ;//--------------------------------------------------------------

        ;// VLD4 de-interleaves with a stride of four elements, so the
        ;// block arrives in the D registers already transposed.

        VLD4    {dSrc0, dSrc1, dSrc2, dSrc3}, [pBlk]
        LDR     pDivTab, =armVCM4P10_QPDivTable         ;// Base of the QP division table
        LDR     pModTab, =armVCM4P10_VMatrixQPModTable  ;// Base of the QP modulo table

        ;//--------------------------------------------------------------
        ;// Pass one (row operations, carried out on the columns)
        ;//
        ;// The scale factor is computed with ARM instructions that are
        ;// interleaved with the NEON butterflies in order to dual issue;
        ;// the instruction order below is deliberate.
        ;//--------------------------------------------------------------

        VADD    dRSumA, dSrc0, dSrc1
        VADD    dRSumB, dSrc2, dSrc3
        VSUB    dRDifA, dSrc0, dSrc1
        LDRSB   qpShift, [pDivTab, qpIdx]               ;// ARM: Shift = pQPDivTable[QP]
        VSUB    dRDifB, dSrc2, dSrc3
        LDRSB   qpScale, [pModTab, qpIdx]               ;// ARM: Scale = pQPModTable[QP]
        VADD    dRow0, dRSumA, dRSumB
        VSUB    dRow1, dRSumA, dRSumB
        VSUB    dRow2, dRDifA, dRDifB
        LSL     qpScale, qpScale, qpShift               ;// ARM: Scale = Scale << Shift
        VADD    dRow3, dRDifA, dRDifB

        ;//--------------------------------------------------------------
        ;// Transpose the intermediate: swap 16-bit lanes inside each
        ;// register pair, then 32-bit lanes across the two Q registers.
        ;//--------------------------------------------------------------

        VTRN    dRow0, dRow1
        VTRN    dRow2, dRow3
        VTRN    qRow01, qRow23

        ;//--------------------------------------------------------------
        ;// Pass two (column operations) - same butterfly as pass one
        ;//--------------------------------------------------------------

        VADD    dCSumA, dRow0, dRow1
        VADD    dCSumB, dRow2, dRow3
        VSUB    dCDifA, dRow0, dRow1
        VSUB    dCDifB, dRow2, dRow3
        VADD    dCol0, dCSumA, dCSumB
        VSUB    dCol1, dCSumA, dCSumB
        VSUB    dCol2, dCDifA, dCDifB
        VADD    dCol3, dCDifA, dCDifB

        ;//--------------------------------------------------------------
        ;// Dequantise
        ;//
        ;// This improves on the C reference code: the Shift >= 0 and
        ;// Shift < 0 cases are handled by one code path.  Rather than
        ;// subtracting 2 from Shift as the C reference does, Scale is
        ;// shifted left by Shift once (above) and every product is
        ;// shifted right by a constant 2 after the multiplication, so
        ;// the rounding constant is always 2.
        ;//
        ;// This avoids the branches that would otherwise be needed and
        ;// reduces the code size substantially.
        ;//--------------------------------------------------------------

        VDUP    dMul, qpScale                           ;// ARM -> NEON: replicate Scale into every lane

        VMOV    qAcc0, #2                               ;// Preload each accumulator with Round
        VMOV    qAcc1, #2
        VMOV    qAcc2, #2
        VMOV    qAcc3, #2

        VMLAL   qAcc0, dCol0, dMul                      ;// pDst[i] * Scale + Round (32-bit)
        VMLAL   qAcc1, dCol1, dMul
        VMLAL   qAcc2, dCol2, dMul
        VMLAL   qAcc3, dCol3, dMul

        VSHRN   dRes0, qAcc0, #2                        ;// >> 2, then narrow: (OMX_S16)Value
        VSHRN   dRes1, qAcc1, #2
        VSHRN   dRes2, qAcc2, #2
        VSHRN   dRes3, qAcc3, #2

        ;//--------------------------------------------------------------
        ;// Write the sixteen results back over the input block
        ;//--------------------------------------------------------------

        VST1    {dRes0, dRes1, dRes2, dRes3}, [pBlk]


        ;// No return value is set by this routine

        ;// Function tail
        M_END

        ENDIF                                           ;// CortexA8



;//======================================================================
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Calls armVCM4P10_UnpackBlock4x4 with the incoming r0/r1 untouched,
;// then runs armVCM4P10_InvTransformDequantLumaDC4x4 in place on the
;// destination block with the caller's QP.
;//
;//   In:   r0 = ppSrc (handed straight to armVCM4P10_UnpackBlock4x4)
;//         r1 = pDst  (destination 4x4 block)
;//         r2 = QP
;//   Out:  r0 = OMX_Sts_NoErr (always)
;//
;// NOTE(review): no argument validation is performed here - confirm
;// that callers guarantee valid pointers and QP range.
;//======================================================================

;// Arguments
ppSrc           RN 0
pDst            RN 1
qpArg           RN 2

;// Return value
retVal          RN 0

;// Scratch: r4/r5 keep pDst and QP alive across the first call,
;// r0/r1 are the argument slots for the second call
pDstKeep        RN 4
pDstArg0        RN 0
qpArg1          RN 1
qpKeep          RN 5

;// Implementation is only assembled for the CortexA8 variant

        IF CortexA8

        ;// No stack locals are required

        ;// Function header
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// Stash the two values that must outlive the unpack call
        MOV     pDstKeep, pDst                          ;// r4 = pDst
        MOV     qpKeep, qpArg                           ;// r5 = QP
        BL      armVCM4P10_UnpackBlock4x4               ;// r0 = ppSrc, r1 = pDst as received

        ;// Re-stage the arguments for the transform: (pDst, QP)
        MOV     pDstArg0, pDstKeep                      ;// r0 = pDst
        MOV     qpArg1, qpKeep                          ;// r1 = QP
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Return value
        MOV     retVal, #OMX_Sts_NoErr

        ;// Function tail
        M_END


        ENDIF                                           ;// CortexA8


        END