;//
;//
;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// Syntax:  ARM armasm (RN/DN/QN register aliases, M_* macros from armCOMM_s.h)
;// Target:  Cortex-A8 variant only (ARM core + NEON)
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_TransformResidual4x4     ;// Not called below: its body is inlined
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixU16
        IMPORT  armVCM4P10_QPModuloTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_DequantLumaAC4x4
;// (no standalone body in this variant; its code is inlined in the function below)


;//---------------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Purpose:
;//   Builds a 4x4 block of 16-bit residuals, adds it to a 4x4 block of
;//   predicted bytes and stores the saturated 8-bit sums.
;//     AC != 0 : armVCM4P10_UnpackBlock4x4 fills the 32-byte scratch buffer,
;//               the coefficients are dequantized, element [0] is replaced by
;//               *pDC when pDC is non-NULL, then the inlined 4x4 inverse
;//               transform and a rounding >>6 are applied.
;//     AC == 0 : every residual is (*pDC + 32) >> 6.
;//
;// Inputs:
;//   r0       ppSrc    passed through unchanged to armVCM4P10_UnpackBlock4x4
;//   r1       pPred    address of the prediction rows (4 bytes read per row)
;//   r2       pDC      address of a signed 16-bit DC value; may be NULL only
;//                     when AC != 0 (see NOTE below)
;//   r3       pDst     address of the destination rows (4 bytes written per row)
;//   [stack]  predStep byte stride between prediction rows
;//   [stack]  dstStep  byte stride between destination rows
;//   [stack]  QP       byte index into armVCM4P10_QPModuloTable / QPDivTable
;//   [stack]  AC       zero selects the DC-only path
;//
;// Output:
;//   r0       result   always OMX_Sts_NoErr
;//
;// NOTE(review): no argument is validated here. The AC == 0 path loads from
;// pDC unconditionally, so callers must not pass pDC == NULL with AC == 0.
;//---------------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF CortexA8


;// ARM Registers
;//
;// Several aliases share one physical register; the instruction order below
;// relies on each earlier value being dead before the register is reused:
;//   r0  : ppSrc, pDeltaArg0, result
;//   r1  : pPred, pDeltaArg1, QP, index0, predstep
;//   r2  : pDC, pVRow
;//   r3  : pDst, PredVal1
;//   r5  : AC, PredVal2
;//   r10 : pQPdiv, index1, DCval, dstStep

;//Input Registers
ppSrc           RN 0
pPred           RN 1
pDC             RN 2
pDst            RN 3


;//Output Registers
result          RN 0

;//Local Scratch Registers

;//Registers used in armVCM4P10_DequantLumaAC4x4
pQPdiv          RN 10
pQPmod          RN 11
pVRow           RN 2
QPmod           RN 12
shift           RN 14
index0          RN 1
index1          RN 10

;//Registers used in DequantTransformResidualFromPairAndAdd
pDelta          RN 4
pDeltaTmp       RN 6            ;// Not referenced by the code in this file
AC              RN 5            ;//Load from stack
pPredTemp       RN 7
pDCTemp         RN 8
pDstTemp        RN 9
pDeltaArg1      RN 1
pDeltaArg0      RN 0            ;// Not referenced by the code in this file
QP              RN 1            ;//Load from stack
DCval           RN 10
predstep        RN 1
dstStep         RN 10
PredVal1        RN 3
PredVal2        RN 5


;// Neon Registers

;// Registers used in armVCM4P10_DequantLumaAC4x4

dVmatrix        DN D6.8
dindexRow0      DN D7.32
dindexRow1      DN D9.32
dByteIndexRow0  DN D7.8         ;// Byte view of dindexRow0
dByteIndexRow1  DN D9.8         ;// Byte view of dindexRow1
dVRow0          DN D8.8
dVRow1          DN D4.8
dVRow0U16       DN D8.U16
dVRow1U16       DN D4.U16
dVRow2U16       DN D8.U16       ;// Same register as dVRow0U16 (row2 uses row0's factors)
dVRow3U16       DN D4.U16       ;// Same register as dVRow1U16 (row3 uses row1's factors)

dShift          DN D5.U16
dSrcRow0        DN D0.I16
dSrcRow1        DN D1.I16
dSrcRow2        DN D2.I16
dSrcRow3        DN D3.I16
dDqntRow0       DN D0.I16
dDqntRow1       DN D1.I16
dDqntRow2       DN D2.I16
dDqntRow3       DN D3.I16

;// Registers used in TransformResidual4x4

;// Packed Input pixels
dIn0            DN D0.S16
dIn1            DN D1.S16
dIn2            DN D2.S16
dIn3            DN D3.S16
qIn01           QN Q0.32
qIn23           QN Q1.32

;// Intermediate calculations
dZero           DN D4.S16
de0             DN D5.S16
de1             DN D6.S16
de2             DN D7.S16
de3             DN D8.S16
dIn1RS          DN D7.S16
dIn3RS          DN D8.S16
df0             DN D0.S16
df1             DN D1.S16
df2             DN D2.S16
df3             DN D3.S16
qf01            QN Q0.32
qf23            QN Q1.32
dg0             DN D5.S16
dg1             DN D6.S16
dg2             DN D7.S16
dg3             DN D8.S16
df1RS           DN D7.S16
df3RS           DN D8.S16

;// Output pixels
dh0             DN D0.S16
dh1             DN D1.S16
dh2             DN D2.S16
dh3             DN D3.S16

;// Registers used in DequantTransformResidualFromPairAndAdd

dDeltaRow0      DN D0.S16
dDeltaRow1      DN D1.S16
dDeltaRow2      DN D2.S16
dDeltaRow3      DN D3.S16
qDeltaRow01     QN Q0.S16
qDeltaRow23     QN Q1.S16

dPredValRow01   DN D4.U8
dPredValRow23   DN D5.U8

qSumRow01       QN Q3.S16
qSumRow23       QN Q4.S16
dDstRow01       DN D0.U8
dDstRow23       DN D1.U8
dDstRow0        DN D0.32[0]
dDstRow1        DN D0.32[1]
dDstRow2        DN D1.32[0]
dDstRow3        DN D1.32[1]


        ;// Allocate stack memory required by the function:
        ;// 32 bytes = the sixteen 16-bit coefficients of one 4x4 block
        M_ALLOC8 pBuffer, 32


        ;// Write function header
        ;// NOTE(review): r11,d9 presumably name the highest core / NEON
        ;// callee-saved registers the macro must preserve - confirm in armCOMM_s.h
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4


        M_ADR   pDelta,pBuffer                      ;// pDelta = address of the scratch buffer
        M_LDR   AC,ACOnStack


        ;// Save registers r1,r2,r3 before function call
        ;// (r1-r3 are also reused as scratch further down)
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        CMP     AC,#0
        BEQ     DCcase                              ;// AC == 0: residual comes from *pDC only
        MOV     pDeltaArg1,pDelta                   ;// Set up r1 for armVCM4P10_UnpackBlock4x4

        BL      armVCM4P10_UnpackBlock4x4           ;// r0 is still ppSrc as received from the caller

        ;//--------------------------------------------------------
        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
        ;//--------------------------------------------------------

        ;//BL armVCM4P10_DequantLumaAC4x4
        M_LDR   QP,QPOnStack                        ;// Set up r1 for armVCM4P10_DequantLumaAC4x4

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16


        LDRSB   QPmod,[pQPmod,QP]                   ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP]                   ;// Shift = QP / 6

        ;// pQPdiv (r10) and QP (r1) are dead from here; both registers are reused
        LDR     index1,=0x03020504
        LDR     index0,=0x05040100                  ;// Indexes into dVmatrix
        ADD     pVRow,pVRow,QPmod                   ;// QPmod is used as a byte offset into the table
        VDUP    dindexRow0,index0
        VDUP    dindexRow1,index1
        VDUP    dShift,shift

        ;// Load all 4x4 pVRow[] values
        ;// (8 bytes are loaded; the index vectors only select bytes 0..5)
        VLD1    dVmatrix,[pVRow]                    ;// dVmatrix = [0d|0c|0b|0a]


        VTBL    dVRow0,dVmatrix,dByteIndexRow0      ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
        VTBL    dVRow1,dVmatrix,dByteIndexRow1      ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]

        ;// Flags set here are consumed by LDRSHNE and VMOVNE below; no
        ;// instruction in between is flag-setting, so do not insert one.
        CMP     pDCTemp,#0
        ;// Load all the 4x4 'src' values
        VLD1    { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]

        VSHL    dVRow0U16,dVRow0U16,dShift          ;// Scale factors <<= (QP / 6)
        VSHL    dVRow1U16,dVRow1U16,dShift
        LDRSHNE DCval,[pDCTemp]                     ;// pDC != NULL: fetch DC (index1 in r10 is dead by now)


        ;// Multiply src[] with pVRow[]
        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
        VMUL    dDqntRow2,dSrcRow2,dVRow2U16
        VMUL    dDqntRow3,dSrcRow3,dVRow3U16



        ;//-------------------------------------------------------------
        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
        ;//-------------------------------------------------------------


        ;//BL armVCM4P10_TransformResidual4x4
        ;//STRHNE DCval,[pDelta]
        VMOVNE  dIn0[0],DCval                       ;// pDC != NULL: element [0] of the block = *pDC



        ;//*****************************************************************
        ;// Transpose the input pixels : perform Row ops as Col ops
        ;//*****************************************************************

        VTRN    dIn0,dIn1
        VTRN    dIn2,dIn3
        VTRN    qIn01,qIn23


        VMOV    dZero,#0                            ;// Used to right shift by 1 (VHADD x,0 = x>>1)


        ;//****************************************
        ;// Row Operations (Performed on columns)
        ;//****************************************


        VADD    de0,dIn0,dIn2                       ;// e0 = d0 + d2
        VSUB    de1,dIn0,dIn2                       ;// e1 = d0 - d2
        VHADD   dIn1RS,dIn1,dZero                   ;// (d1>>1) dZero is a register holding 0
        VHADD   dIn3RS,dIn3,dZero                   ;// (d3>>1)
        VSUB    de2,dIn1RS,dIn3                     ;// e2 = (d1>>1) - d3
        VADD    de3,dIn1,dIn3RS                     ;// e3 = d1 + (d3>>1)
        VADD    df0,de0,de3                         ;// f0 = e0 + e3
        VADD    df1,de1,de2                         ;// f1 = e1 + e2
        VSUB    df2,de1,de2                         ;// f2 = e1 - e2
        VSUB    df3,de0,de3                         ;// f3 = e0 - e3



        ;//*****************************************************************
        ;// Transpose the resultant matrix
        ;//*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;//*******************************
        ;// Column Operations
        ;//*******************************


        VADD    dg0,df0,df2                         ;// g0 = f0 + f2
        VSUB    dg1,df0,df2                         ;// g1 = f0 - f2
        VHADD   df1RS,df1,dZero                     ;// (f1>>1) dZero is a register holding 0
        VHADD   df3RS,df3,dZero                     ;// (f3>>1)
        VSUB    dg2,df1RS,df3                       ;// g2 = (f1>>1) - f3
        VADD    dg3,df1,df3RS                       ;// g3 = f1 + (f3>>1)
        VADD    dh0,dg0,dg3                         ;// h0 = g0 + g3
        VADD    dh1,dg1,dg2                         ;// h1 = g1 + g2
        VSUB    dh2,dg1,dg2                         ;// h2 = g1 - g2
        VSUB    dh3,dg0,dg3                         ;// h3 = g0 - g3


        ;//************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;//************************************************

        VRSHR   dh0,#6
        VRSHR   dh1,#6
        VRSHR   dh2,#6
        VRSHR   dh3,#6


        B       OutDCcase                           ;// D0-D3 now hold the residual rows


DCcase
        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
        ;// NOTE(review): pDC is dereferenced without a NULL check on this path
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32
        ASR     DCval,DCval,#6

        VDUP    dDeltaRow0, DCval                   ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3]  = DCval
        VDUP    dDeltaRow1, DCval                   ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7]  = DCval
        VDUP    dDeltaRow2, DCval                   ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        VDUP    dDeltaRow3, DCval                   ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


OutDCcase
        ;// Both paths arrive with the four residual rows in D0-D3 (Q0,Q1)
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack              ;// Reuses r10; DCval is dead

        ;// Gather the 4x4 prediction block, 4 bytes per row, two rows per D register
        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp],predstep
        VMOV    dPredValRow01,PredVal1,PredVal2

        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp]
        VMOV    dPredValRow23,PredVal1,PredVal2


        ;// sum = residual + widened prediction bytes, then saturate back to 8-bit unsigned
        VADDW   qSumRow01,qDeltaRow01,dPredValRow01
        VADDW   qSumRow23,qDeltaRow23,dPredValRow23
        VQMOVUN dDstRow01,qSumRow01
        VQMOVUN dDstRow23,qSumRow23


        ;// Store one 32-bit lane (4 pixels) per destination row
        VST1    dDstRow0,[pDstTemp],dstStep
        VST1    dDstRow1,[pDstTemp],dstStep
        VST1    dDstRow2,[pDstTemp],dstStep
        VST1    dDstRow3,[pDstTemp]

        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

    ENDIF                                           ;//CORTEXA8



    END