omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//



;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_TransformResidual4x4
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixU16
        IMPORT  armVCM4P10_QPModuloTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//----------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Only a CortexA8 variant is provided; it is guarded by IF CortexA8.
;// The static helper armVCM4P10_DequantLumaAC4x4 and the 4x4 inverse
;// transform are both inlined into the body below, so the coefficients
;// stay in NEON registers between the stages.
;//
;// Register arguments:
;//      r0 = ppSrc   passed through untouched to armVCM4P10_UnpackBlock4x4
;//      r1 = pPred   prediction block, read as 4 rows of 4 bytes
;//      r2 = pDC     optional DC coefficient (tested against 0 on the
;//                   AC path, dereferenced unconditionally on the DC path)
;//      r3 = pDst    destination block, written as 4 rows of 4 bytes
;// Stack arguments (4 bytes each, in this order):
;//      predStep, dstStep, QP, AC
;// Returns:
;//      r0 = OMX_Sts_NoErr (no argument checking is done here)
;//
;// Flow:
;//      AC == 0 : residual = (*pDC + 32) >> 6, replicated to all 16 entries
;//      AC != 0 : unpack -> dequantise -> optional DC override ->
;//                4x4 inverse transform -> rounding shift right by 6
;//      both    : dst = saturate_u8(pred + residual)
;//----------------------------------------------------------------------

        IF CortexA8


;//----------------------------------------------------------------------
;// Core (ARM) register aliases
;// Several aliases share one physical register; each pair is never live
;// at the same time.
;//----------------------------------------------------------------------

;// Incoming arguments
ppSrc           RN 0
pPred           RN 1
pDC             RN 2
pDst            RN 3

;// Return value
result          RN 0

;// Scratch registers of the inlined dequantisation stage
pDivTab         RN 10                   ;// -> armVCM4P10_QPDivTable
pModTab         RN 11                   ;// -> armVCM4P10_QPModuloTable
pVTab           RN 2                    ;// -> armVCM4P10_VMatrixU16 (+ offset)
vOffset         RN 12                   ;// byte fetched from the modulo table
qpShift         RN 14                   ;// byte fetched from the div table
tblIdx0         RN 1                    ;// VTBL byte indices, rows 0 and 2
tblIdx1         RN 10                   ;// VTBL byte indices, rows 1 and 3

;// Scratch registers of the function body
pCoef           RN 4                    ;// -> 32-byte coefficient buffer on the stack
pCoefTmp        RN 6
hasAC           RN 5                    ;// AC flag, loaded from the stack
pPredSave       RN 7                    ;// copies of r1-r3 that survive the BL
pDCSave         RN 8
pDstSave        RN 9
unpackArg1      RN 1                    ;// second argument of armVCM4P10_UnpackBlock4x4
unpackArg0      RN 0
QP              RN 1                    ;// loaded from the stack
dcVal           RN 10
predStep        RN 1
dstStep         RN 10
predLo          RN 3                    ;// two consecutive 4-byte prediction rows
predHi          RN 5


;//----------------------------------------------------------------------
;// NEON register aliases
;//----------------------------------------------------------------------

;// Dequantisation stage
dVTab           DN D6.8                 ;// 8 bytes loaded from the V matrix
dLaneIdx0       DN D7.32                ;// index word replicated to both 32-bit lanes
dLaneIdx1       DN D9.32
dByteIdx0       DN D7.8                 ;// byte views of the two index registers
dByteIdx1       DN D9.8
dScaleEven8     DN D8.8                 ;// VTBL result for rows 0 and 2
dScaleOdd8      DN D4.8                 ;// VTBL result for rows 1 and 3
dScale0         DN D8.U16               ;// 16-bit views of the same scale registers
dScale1         DN D4.U16
dScale2         DN D8.U16
dScale3         DN D4.U16

dShift          DN D5.U16
dCoef0          DN D0.I16               ;// unpacked coefficients, one row per register
dCoef1          DN D1.I16
dCoef2          DN D2.I16
dCoef3          DN D3.I16
dDeq0           DN D0.I16               ;// dequantised coefficients (in place)
dDeq1           DN D1.I16
dDeq2           DN D2.I16
dDeq3           DN D3.I16

;// Inverse transform stage: packed input
dIn0            DN D0.S16
dIn1            DN D1.S16
dIn2            DN D2.S16
dIn3            DN D3.S16
qIn01           QN Q0.32
qIn23           QN Q1.32

;// Inverse transform stage: intermediates
dZero           DN D4.S16
de0             DN D5.S16
de1             DN D6.S16
de2             DN D7.S16
de3             DN D8.S16
dIn1RS          DN D7.S16
dIn3RS          DN D8.S16
df0             DN D0.S16
df1             DN D1.S16
df2             DN D2.S16
df3             DN D3.S16
qf01            QN Q0.32
qf23            QN Q1.32
dg0             DN D5.S16
dg1             DN D6.S16
dg2             DN D7.S16
dg3             DN D8.S16
df1RS           DN D7.S16
df3RS           DN D8.S16

;// Inverse transform stage: output
dh0             DN D0.S16
dh1             DN D1.S16
dh2             DN D2.S16
dh3             DN D3.S16

;// Prediction add stage
dRes0           DN D0.S16               ;// residual rows (same registers as dh0..dh3)
dRes1           DN D1.S16
dRes2           DN D2.S16
dRes3           DN D3.S16
qRes01          QN Q0.S16
qRes23          QN Q1.S16

dPred01         DN D4.U8                ;// prediction rows 0-1 and 2-3, 8 bytes each
dPred23         DN D5.U8

qSum01          QN Q3.S16
qSum23          QN Q4.S16
dOut01          DN D0.U8
dOut23          DN D1.U8
dOutRow0        DN D0.32[0]             ;// one 4-byte output row per 32-bit lane
dOutRow1        DN D0.32[1]
dOutRow2        DN D1.32[0]
dOutRow3        DN D1.32[1]


        ;// Reserve 32 bytes of stack (pBuffer): room for 16 coefficients of
        ;// 16 bits, which is what the four-register VLD1 below reads back.
        M_ALLOC8 pBuffer, 32


        ;// Function entry. The r11/d9 operands presumably tell the macro how
        ;// far up the callee-saved registers must be preserved -- the body
        ;// uses r4-r11 and D8/D9 (verify against armCOMM_s.h).
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9

        ;// Arguments passed on the stack; the declaration order is significant
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack, 4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack, 4


        M_ADR   pCoef,pBuffer
        M_LDR   hasAC,ACOnStack


        ;// r1-r3 are reused as scratch further down (and across the BL),
        ;// so keep copies of the pointer arguments in r7-r9.
        MOV     pPredSave,pPred
        MOV     pDCSave,pDC
        MOV     pDstSave,pDst

        CMP     hasAC,#0
        BEQ     DCOnly                          ;// no AC coefficients: DC-only shortcut
        MOV     unpackArg1,pCoef                ;// r1 = buffer for armVCM4P10_UnpackBlock4x4 (r0 is still ppSrc)

        BL      armVCM4P10_UnpackBlock4x4

        ;//--------------------------------------------------------
        ;// Dequantisation (armVCM4P10_DequantLumaAC4x4, inlined
        ;// instead of "BL armVCM4P10_DequantLumaAC4x4")
        ;//--------------------------------------------------------

        M_LDR   QP,QPOnStack                    ;// r1 = QP

        LDR     pModTab,=armVCM4P10_QPModuloTable
        LDR     pDivTab,=armVCM4P10_QPDivTable
        LDR     pVTab,=armVCM4P10_VMatrixU16


        ;// Both tables are indexed by QP with byte entries. According to the
        ;// original annotations they hold (QP % 6) * 6 and QP / 6.
        LDRSB   vOffset,[pModTab,QP]            ;// byte offset of the V-matrix row
        LDRSB   qpShift,[pDivTab,QP]            ;// left-shift amount

        LDR     tblIdx1,=0x03020504             ;// byte picks for rows 1/3: halfword 2, then halfword 1
        LDR     tblIdx0,=0x05040100             ;// byte picks for rows 0/2: halfword 0, then halfword 2
        ADD     pVTab,pVTab,vOffset
        VDUP    dLaneIdx0,tblIdx0
        VDUP    dLaneIdx1,tblIdx1
        VDUP    dShift,qpShift

        ;// Fetch 8 bytes of V-matrix data starting at the selected row
        VLD1    dVTab,[pVTab]


        ;// Spread the scale factors into per-row vectors (lanes listed high to low)
        VTBL    dScaleEven8,dVTab,dByteIdx0     ;// rows 0,2 : [ V[2] | V[0] | V[2] | V[0] ]
        VTBL    dScaleOdd8,dVTab,dByteIdx1      ;// rows 1,3 : [ V[1] | V[2] | V[1] | V[2] ]
        CMP     pDCSave,#0                      ;// flags consumed by LDRSHNE / VMOVNE below
        ;// Bring the 16 unpacked coefficients into D0-D3
        VLD1    { dCoef0,dCoef1,dCoef2,dCoef3 },[pCoef]

        VSHL    dScale0,dScale0,dShift          ;// scale <<= shift (rows 0 and 2)
        VSHL    dScale1,dScale1,dShift          ;// scale <<= shift (rows 1 and 3)
        LDRSHNE dcVal,[pDCSave]                 ;// only when pDC != NULL


        ;// coefficient * scale, row by row (dScale2/dScale3 alias dScale0/dScale1)
        VMUL    dDeq0,dCoef0,dScale0
        VMUL    dDeq1,dCoef1,dScale1
        VMUL    dDeq2,dCoef2,dScale2
        VMUL    dDeq3,dCoef3,dScale3



        ;//-------------------------------------------------------------
        ;// 4x4 inverse transform (armVCM4P10_TransformResidual4x4,
        ;// inlined so the data never goes back to memory)
        ;//-------------------------------------------------------------


        ;// When a DC pointer was supplied, *pDC replaces coefficient [0].
        ;// This takes the place of "STRHNE DCval,[pDelta]" followed by a call.
        ;// None of the instructions since the CMP sets flags, so NE is still valid.
        VMOVNE  dIn0[0],dcVal



        ;//*****************************************************************
        ;// Transpose the 4x4 block so the row pass can be done lane-parallel
        ;//*****************************************************************

        VTRN    dIn0,dIn1
        VTRN    dIn2,dIn3
        VTRN    qIn01,qIn23


        VMOV    dZero,#0                        ;// VHADD x,0 == x >> 1


        ;//****************************************
        ;// First pass (rows, processed as columns)
        ;//****************************************


        VADD    de0,dIn0,dIn2                   ;// e0 = d0 + d2
        VSUB    de1,dIn0,dIn2                   ;// e1 = d0 - d2
        VHADD   dIn1RS,dIn1,dZero               ;// d1 >> 1
        VHADD   dIn3RS,dIn3,dZero               ;// d3 >> 1
        VSUB    de2,dIn1RS,dIn3                 ;// e2 = (d1 >> 1) - d3
        VADD    de3,dIn1,dIn3RS                 ;// e3 = d1 + (d3 >> 1)
        VADD    df0,de0,de3                     ;// f0 = e0 + e3
        VADD    df1,de1,de2                     ;// f1 = e1 + e2
        VSUB    df2,de1,de2                     ;// f2 = e1 - e2
        VSUB    df3,de0,de3                     ;// f3 = e0 - e3



        ;//*****************************************************************
        ;// Transpose again before the second pass
        ;//*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;//*******************************
        ;// Second pass (columns)
        ;//*******************************


        VADD    dg0,df0,df2                     ;// g0 = f0 + f2
        VSUB    dg1,df0,df2                     ;// g1 = f0 - f2
        VHADD   df1RS,df1,dZero                 ;// f1 >> 1
        VHADD   df3RS,df3,dZero                 ;// f3 >> 1
        VSUB    dg2,df1RS,df3                   ;// g2 = (f1 >> 1) - f3
        VADD    dg3,df1,df3RS                   ;// g3 = f1 + (f3 >> 1)
        VADD    dh0,dg0,dg3                     ;// h0 = g0 + g3
        VADD    dh1,dg1,dg2                     ;// h1 = g1 + g2
        VSUB    dh2,dg1,dg2                     ;// h2 = g1 - g2
        VSUB    dh3,dg0,dg3                     ;// h3 = g0 - g3


        ;//************************************************
        ;// Final scaling: (h + 32) >> 6 via rounding shift
        ;//************************************************

        VRSHR   dh0,#6
        VRSHR   dh1,#6
        VRSHR   dh2,#6
        VRSHR   dh3,#6


        B       AddPrediction


DCOnly
        ;// DC-only block: every residual sample equals (DC + 32) >> 6
        LDRSH   dcVal,[pDCSave]
        ADD     dcVal,dcVal,#32
        ASR     dcVal,dcVal,#6

        VDUP    dRes0,dcVal                     ;// residual row 0
        VDUP    dRes1,dcVal                     ;// residual row 1
        VDUP    dRes2,dcVal                     ;// residual row 2
        VDUP    dRes3,dcVal                     ;// residual row 3


AddPrediction
        ;// Both paths arrive here with the 4x4 residual in D0-D3 (Q0/Q1)
        M_LDR   predStep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack

        ;// Gather the four 4-byte prediction rows, two rows per D register
        LDR     predLo,[pPredSave],predStep
        LDR     predHi,[pPredSave],predStep
        VMOV    dPred01,predLo,predHi

        LDR     predLo,[pPredSave],predStep
        LDR     predHi,[pPredSave]
        VMOV    dPred23,predLo,predHi


        ;// Widen the prediction bytes, add the residual, then narrow back
        ;// to unsigned bytes with saturation
        VADDW   qSum01,qRes01,dPred01
        VADDW   qSum23,qRes23,dPred23
        VQMOVUN dOut01,qSum01
        VQMOVUN dOut23,qSum23


        ;// Write one 32-bit lane (four pixels) per destination row
        VST1    dOutRow0,[pDstSave],dstStep
        VST1    dOutRow1,[pDstSave],dstStep
        VST1    dOutRow2,[pDstSave],dstStep
        VST1    dOutRow3,[pDstSave]

        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

        ENDIF                                   ;//CORTEXA8



        END