omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// Syntax : ARM assembler (armasm), one statement per line.
;//          Register aliases (RN) and labels start in column 0;
;//          instructions, directives and macros are indented.
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_TransformResidual4x4
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixU16
        IMPORT  armVCM4P10_QPModuloTable

        M_VARIANTS ARM1136JS, ARM1136JS_U

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//=============================================================================================
;// Static Function: armVCM4P10_DequantLumaAC4x4
;//
;// Scales a 4x4 block of 16-bit coefficients in place:
;//
;//      pSrcDst[i] = pSrcDst[i] * (pVRow[col(i)] << Shift)      for i = 0..15
;//
;// where pVRow is the row of armVCM4P10_VMatrixU16 selected through
;// armVCM4P10_QPModuloTable[QP], Shift = armVCM4P10_QPDivTable[QP] and col(i) is 0, 1 or 2.
;// Each product is truncated to its low 16 bits when it is packed back (PKHBT).
;//
;// In    : r0 = pSrcDst  pointer to 16 halfwords (read and written as 8 words with LDM/STM)
;//         r1 = QP       index into the two QP lookup tables
;// Out   : the 16 halfwords at pSrcDst are overwritten; no return value is set
;// Uses  : r1-r12 and r14 as scratch; register save/restore is delegated to the
;//         M_START / M_END macros from armCOMM_s.h (see that header for what is preserved)
;//
;// Two variants follow. They differ only in how the table row is loaded.
;//=============================================================================================

;// Guarding implementation by the processor name

    IF  ARM1136JS

;//Input Registers
pSrcDst         RN  0
QP              RN  1


;//Output Registers


;//Local Scratch Registers
;// Several physical registers are reused once their first value is dead:
;//   r1 : QP     -> rowLuma01      r4 : pQPdiv -> rowLuma23
;//   r2 : pVRow  -> temp1          r5 : pQPmod -> SrcDst00
;//   r3 : shift  -> temp2          r6 : QPmod  -> SrcDst02
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1
rowLuma23       RN  4

SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2
temp2           RN  3
temp3           RN  14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]                       ;// (QP%6) * 6 : byte offset of the table row
        LDRSB   shift,[pQPdiv,QP]                       ;// Shift = QP / 6

        ;// Halfword loads only: this variant never issues an unaligned word load on the table
        LDRH    rowLuma01,[pVRow,QPmod]!                ;// rowLuma01 = [00|0a], pVRow now points at the row
        LDRH    temp3,[pVRow,#2]                        ;// temp3     = [00|0b]
        LDRH    rowLuma23,[pVRow,#4]                    ;// rowLuma23 = [00|0c]
        ORR     rowLuma01,rowLuma01,temp3,LSL #16       ;// rowLuma01 = [0b|0a]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL     rowLuma01,rowLuma01,shift
        LSL     rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the Loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store in one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1

        SMULTB  temp1,SrcDst00,rowLuma23                ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01             ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01             ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16         ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23             ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16         ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23             ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16         ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01             ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16         ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01             ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16         ;// Pack the next two product values


        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23             ;// pSrcDst[12] * (pVRow[2]<<Shift)

        SMULTT  temp3,SrcDst32,rowLuma01                ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23             ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16         ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write the 16 scaled coefficients back over the input
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value



        ;// Write function tail
        M_END

    ENDIF                                               ;//ARM1136JS


;// Guarding implementation by the processor name

    IF  ARM1136JS_U

;//Input Registers
pSrcDst         RN  0
QP              RN  1


;//Output Registers


;//Local Scratch Registers
;// Same register reuse as the ARM1136JS variant:
;//   r1 : QP     -> rowLuma01      r4 : pQPdiv -> rowLuma23
;//   r2 : pVRow  -> temp1          r5 : pQPmod -> SrcDst00
;//   r3 : shift  -> temp2          r6 : QPmod  -> SrcDst02
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1
rowLuma23       RN  4

SrcDst00        RN  5
SrcDst02        RN  6
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2
temp2           RN  3
temp3           RN  14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]                       ;// (QP%6) * 6 : byte offset of the table row
        LDRSB   shift,[pQPdiv,QP]                       ;// Shift = QP / 6

        ;// The row offset is a multiple of 6 bytes, so this word load is not always
        ;// word aligned; that is what distinguishes this variant from ARM1136JS.
        LDR     rowLuma01,[pVRow,QPmod]!                ;// rowLuma01 = [0b|0a], pVRow now points at the row

        ;// Only pVRow[2] is needed from here and only the bottom halfword of rowLuma23
        ;// is ever multiplied, so load exactly one halfword. A word load at this offset
        ;// would also fetch pVRow[3], which for the last row (QPmod = 30) lies beyond
        ;// the six 3-halfword rows indexed by this function.
        LDRH    rowLuma23,[pVRow,#4]                    ;// rowLuma23 = [00|0c]

        ;// Load all the 16 'src' values
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL     rowLuma01,rowLuma01,shift
        LSL     rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the Loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication results into a word and store in one go
        ;//
        ;//**********************************************************************************************


        ;// Row 1

        SMULTB  temp1,SrcDst00,rowLuma23                ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01             ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01             ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16         ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23             ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16         ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23             ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16         ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01             ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16         ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01             ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16         ;// Pack the next two product values


        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23             ;// pSrcDst[12] * (pVRow[2]<<Shift)

        SMULTT  temp3,SrcDst32,rowLuma01                ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23             ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16         ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write the 16 scaled coefficients back over the input
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value



        ;// Write function tail
        M_END

    ENDIF                                               ;//ARM1136JS_U





;//=============================================================================================
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Builds a 4x4 block of 16-bit residuals in a 32-byte stack buffer, adds it to a 4x4
;// block of prediction bytes, clips every sum to [0,255] and stores the result to pDst.
;//
;//   AC != 0 : armVCM4P10_UnpackBlock4x4(ppSrc, buffer)
;//             armVCM4P10_DequantLumaAC4x4(buffer, QP)
;//             if pDC != NULL, buffer[0] = *pDC
;//             armVCM4P10_TransformResidual4x4(buffer, buffer)
;//   AC == 0 : every buffer entry = (*pDC + 32) >> 6
;//
;// In    : r0      = ppSrc     passed through unchanged to armVCM4P10_UnpackBlock4x4
;//         r1      = pPred     prediction; 4 bytes are read as one word per row
;//         r2      = pDC       pointer to a signed 16-bit DC value; may be NULL only when AC != 0
;//         r3      = pDst      destination; 4 bytes are written as one word per row
;//         [stack] = predStep  byte step between prediction rows
;//         [stack] = dstStep   byte step between destination rows
;//         [stack] = QP        forwarded to armVCM4P10_DequantLumaAC4x4
;//         [stack] = AC        zero selects the DC-only path
;// Out   : r0 = OMX_Sts_NoErr (no argument checking is performed here)
;//
;// NOTE(review): on the AC == 0 path pDC is dereferenced without a NULL check.
;//=============================================================================================

;// Guarding implementation by the processor name

    IF  ARM1136JS

;//Input Registers
ppSrc           RN  0
pPred           RN  1
pDC             RN  2
pDst            RN  3


;//Output Registers
result          RN  0

;//Local Scratch Registers
;// r4, r7, r8 and r9 hold values that must survive the three BL calls below.
;// Aliases that share a register are never live at the same time
;// (for example pDCTemp/PredVal on r8, DCval/dstStep on r10).
pDelta          RN  4
pDeltaTmp       RN  6                                   ;// not referenced by any instruction below
AC              RN  5                                   ;//Load from stack
pPredTemp       RN  7
pDCTemp         RN  8
pDstTemp        RN  9
pDeltaArg1      RN  1
pDeltaArg0      RN  0
QP              RN  1                                   ;//Load from stack
DCval           RN  10
DCvalCopy       RN  11
predstep        RN  1
dstStep         RN  10
ycounter        RN  0
PredVal1        RN  3
PredVal2        RN  5
DeltaVal1       RN  2
DeltaVal2       RN  11
PredVal         RN  8
tmpDeltaVal     RN  6
sum1            RN  12
sum2            RN  14



        ;// Allocate stack memory required by the function
        ;// 32 bytes = 16 halfword residuals. The buffer is written with STRD below, so
        ;// M_ALLOC8 presumably provides 8-byte alignment - confirm in armCOMM_s.h.
        M_ALLOC8 pBuffer, 32


        ;// Write function header
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4


        M_ADR   pDelta,pBuffer
        M_LDR   AC,ACOnStack


        ;// Save registers r1,r2,r3 before function call
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        CMP     AC,#0
        BEQ     DCcase
        MOV     pDeltaArg1,pDelta                       ;// Set up r1 for armVCM4P10_UnpackBlock4x4 (r0 still holds ppSrc)

        BL      armVCM4P10_UnpackBlock4x4

        M_LDR   QP,QPOnStack                            ;// Set up r1 for DequantLumaAC4x4
        MOV     pDeltaArg0,pDelta                       ;// Set up r0 for DequantLumaAC4x4

        BL      armVCM4P10_DequantLumaAC4x4


        ;// If a DC pointer was supplied, overwrite coefficient 0 with *pDC
        CMP     pDCTemp,#0
        LDRSHNE DCval,[pDCTemp]
        MOV     pDeltaArg0,pDelta                       ;// Set up r0 for armVCM4P10_TransformResidual4x4
        MOV     pDeltaArg1,pDelta                       ;// Set up r1 for armVCM4P10_TransformResidual4x4
        STRHNE  DCval,[pDelta]

        BL      armVCM4P10_TransformResidual4x4
        B       OutDCcase


DCcase
        ;// DC only: all 16 residuals equal (*pDC + 32) >> 6 (arithmetic shift)
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32
        ASR     DCval,DCval,#6
        PKHBT   DCval,DCval,DCval,LSL #16               ;// Duplicating the Lower halfword
        MOV     DCvalCopy, DCval                        ;// Needed for STRD (stores the pair r10, r11)
        STRD    DCval, [pDelta, #0]                     ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3]  = DCval
        STRD    DCval, [pDelta, #8]                     ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7]  = DCval
        STRD    DCval, [pDelta, #16]                    ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        STRD    DCval, [pDelta, #24]                    ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


OutDCcase
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack

        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}         ;// Pre load : tmpDeltaVal = [B A], DeltaVal2 = [D C]
        MOV     ycounter,#4                             ;// Counter for the PredPlusDeltaLoop
        LDR     PredVal,[pPredTemp]                     ;// Pre load : PredVal = [d c b a]

PredPlusDeltaLoop
        ;// One row per iteration. Only SUBS writes the N/Z/C/V flags inside the loop, so
        ;// the GT-conditional loads and the final BGT all test the updated row counter;
        ;// the pre-loads for the next row are skipped on the last iteration.

        SUBS    ycounter,ycounter,#1
        ADD     pPredTemp,pPredTemp,predstep            ;// Increment pPred ptr

        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// DeltaVal1 = [C A]
        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]

        UXTB16  PredVal1,PredVal                        ;// PredVal1 = [0c0a]
        UXTB16  PredVal2,PredVal,ROR #8                 ;// PredVal2 = [0d0b]

        LDRGT   PredVal,[pPredTemp]                     ;// Pre load

        QADD16  sum2,DeltaVal2,PredVal2                 ;// Add and saturate to 16 bits
        QADD16  sum1,DeltaVal1,PredVal1

        USAT16  sum2,#8,sum2                            ;// armClip(0,255,sum2)
        USAT16  sum1,#8,sum1

        LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2}         ;// Pre load

        ORR     sum1,sum1,sum2,LSL #8                   ;// sum1 = [dcba]
        STR     sum1,[pDstTemp]

        ADD     pDstTemp,pDstTemp,dstStep               ;// Increment pDst ptr
        BGT     PredPlusDeltaLoop


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

    ENDIF                                               ;//ARM1136JS


;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd

;// Guarding implementation by the processor name
;// (no further variant of this function is implemented in this file)




    END