omxVCM4P10_InterpolateLuma_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1;// 2;// 3;// File Name: omxVCM4P10_InterpolateLuma_s.s 4;// OpenMAX DL: v1.0.2 5;// Revision: 12290 6;// Date: Wednesday, April 9, 2008 7;// 8;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 9;// 10;// 11;// 12 13;// Function: 14;// omxVCM4P10_InterpolateLuma 15;// 16;// This function implements omxVCM4P10_InterpolateLuma in v6 assembly. 17;// Performs quarter pel interpolation of inter luma MB. 18;// It's assumed that the frame is already padded when calling this function. 19;// Parameters: 20;// [in] pSrc Pointer to the source reference frame buffer 21;// [in] srcStep Reference frame step in bytes 22;// [in] dstStep Destination frame step in bytes. Must be a multiple of roi.width 23;// [in] dx Fractional part of horizontal motion vector 24;// component in 1/4 pixel unit; valid in the range [0,3] 25;// [in] dy Fractional part of vertical motion vector 26;// component in 1/4 pixel unit; valid in the range [0,3] 27;// [in] roi Dimension of the interpolation region; the parameters roi.width and roi.height must 28;// be equal to either 4, 8, or 16. 29;// [out] pDst Pointer to the destination frame buffer. 30;// if roi.width==4, 4-byte alignment required 31;// if roi.width==8, 8-byte alignment required 32;// if roi.width==16, 16-byte alignment required 33;// 34;// Return Value: 35;// If the function runs without error, it returns OMX_Sts_NoErr. 36;// It is assumed that the following conditions are satisfied before calling this function: 37;// pSrc and pDst are not NULL. 38;// srcStep and dstStep are >= roi.width. 39;// dx and dy are in the range [0,3]. 40;// roi.width and roi.height are in the set {4, 8, 16}. 41;// If roi.width is equal to 4, pDst is 4 byte aligned. 42;// If roi.width is equal to 8, pDst is 8 byte aligned. 43;// If roi.width is equal to 16, pDst is 16 byte aligned. 44;// srcStep and dstStep are multiples of 8. 
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT omxVCM4P10_InterpolateLuma


    IF CortexA8
        ;// 4x4 interpolation kernels; they are defined outside this file.
        IMPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        IMPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
    ENDIF


;//-----------------------------------------------------------------------
;// Core register aliases
;//-----------------------------------------------------------------------

;// Arguments arriving in r0-r3
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Remaining block dimensions (loaded from the stack arguments below)
iHeight         RN 4
iWidth          RN 5

;// Scratch / bookkeeping
idx             RN 6
idy             RN 7
index           RN 6            ;// shares r6 with idx: index overwrites idx
Temp            RN 12
pArgs           RN 11           ;// address of the ppArgs save area


    IF CortexA8

        ;//
        ;// M_STORE_LUMA4x4 row0, row1, row2, row3
        ;//
        ;// Common tail of every interpolation case: writes lane 0 (32 bits)
        ;// of the four given D-register aliases to four consecutive
        ;// destination rows starting at pDst, then re-derives pArgs so that
        ;// Block4x4LoopEnd can reload the saved arguments through it.
        ;// Rows 0/1 go through pDst, rows 2/3 through Temp = pDst + 2*dstStep.
        ;// Clobbers: pDst (advanced by dstStep), Temp, pArgs.
        ;// NOTE(review): pArgs is presumably re-derived because the helper
        ;// routines may not preserve r11 - confirm against their sources.
        ;//
        MACRO
        M_STORE_LUMA4x4 $row0, $row1, $row2, $row3
        ADD     Temp, pDst, dstStep, LSL #1
        VST1    $row0[0], [pDst], dstStep
        VST1    $row2[0], [Temp], dstStep
        VST1    $row1[0], [pDst]
        VST1    $row3[0], [Temp]
        M_ADR   pArgs, ppArgs
        MEND

        ;//
        ;// The region of interest is processed one 4x4 block of pixels at a
        ;// time. ppArgs is a 16-byte local area that holds
        ;// {pSrc, srcStep, pDst, dstStep} for the block being processed.
        ;//
        M_ALLOC4 ppArgs, 16

        ;// Function header (r11 / d15 are the register-save arguments of
        ;// the M_START macro from armCOMM_s.h)
        M_START omxVCM4P10_InterpolateLuma, r11, d15

pSrcBK          RN 8            ;// copy of pSrc kept across a helper call

;//-----------------------------------------------------------------------
;// NEON register aliases
;//-----------------------------------------------------------------------

;// Filter coefficients. Loaded below and not read in this file;
;// presumably consumed by the imported helpers (confirm).
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

;// Horizontal interpolation
dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8
dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8
dAccH0          DN 22.U8
dAccH1          DN 24.U8
dAccH2          DN 26.U8
dAccH3          DN 28.U8
dResultH0       DN 22.U32       ;// 32-bit views of dAccH0..dAccH3
dResultH1       DN 24.U32
dResultH2       DN 26.U32
dResultH3       DN 28.U32

;// Vertical interpolation
dSrc0           DN 9.U8
dSrc1           DN 10.U8
dSrc2           DN 11.U8
dSrc3           DN 12.U8
dSrc4           DN 13.U8
dAccV0          DN 0.U8
dAccV1          DN 2.U8
dAccV2          DN 4.U8
dAccV3          DN 6.U8
dResultV0       DN 0.U32        ;// 32-bit views of dAccV0..dAccV3
dResultV1       DN 2.U32
dResultV2       DN 4.U32
dResultV3       DN 6.U32

;// Diagonal interpolation
dTAcc0          DN 0.U8
dTAcc1          DN 2.U8
dTAcc2          DN 4.U8
dTAcc3          DN 6.U8
dTRes0          DN 0.32         ;// 32-bit views of dTAcc0..dTAcc3
dTRes1          DN 2.32
dTRes2          DN 4.32
dTRes3          DN 6.32
dTResult0       DN 14.U8
dTResult1       DN 16.U8
dTResult2       DN 18.U8
dTResult3       DN 20.U8
dTempP0         DN 18.S16       ;// low  half of qTempP01
dTempP1         DN 19.S16       ;// high half of qTempP01
dTempQ0         DN 20.S16
dTempQ1         DN 21.S16
dTempR0         DN 22.S16
dTempR1         DN 23.S16
dTempS0         DN 24.S16
dTempS1         DN 25.S16
qTempP01        QN 9.S16
qTempQ01        QN 10.S16
qTempR01        QN 11.S16
qTempS01        QN 12.S16

;// 16-bit intermediates that get narrowed and averaged in cases 6 and e
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16

;// Plain copy (case 0): 32-bit views of dSrc0..dSrc3
dDst0           DN 9.32
dDst1           DN 10.32
dDst2           DN 11.32
dDst3           DN 12.32

        ;// Stack-passed arguments, 4 bytes each
        M_ARG   ptridx, 4
        M_ARG   ptridy, 4
        M_ARG   ptrWidth, 4
        M_ARG   ptrHeight, 4

        ;// Fetch the stack-passed arguments
        M_LDR   idx, ptridx
        M_LDR   idy, ptridy
        M_LDR   iWidth, ptrWidth
        M_LDR   iHeight, ptrHeight

        ;// index = idx + 4*idy selects one of the 16 cases below
        ADD     index, idx, idy, LSL #2
        M_ADR   pArgs, ppArgs

        ;// Put the coefficients 20 and 5 into their NEON registers
        VMOV    dCoeff20, #20
        VMOV    dCoeff5, #5

        ;// Both loops re-enter at the same address
Block4x4WidthLoop
Block4x4HeightLoop

        ;// Remember this block's arguments; the cases below modify them
        STM     pArgs, {pSrc,srcStep,pDst,dstStep}

        ;// Jump table indexed by [index].
        ;// Assumes ARM (not Thumb) state, where reading pc yields the address
        ;// of this ADD plus 8: index 0 therefore selects the SECOND branch
        ;// below, and the first branch only fills the slot at +4.
        ADD     pc, pc, index, LSL #2
        B       Case_f
        B       Case_0
        B       Case_1
        B       Case_2
        B       Case_3
        B       Case_4
        B       Case_5
        B       Case_6
        B       Case_7
        B       Case_8
        B       Case_9
        B       Case_a
        B       Case_b
        B       Case_c
        B       Case_d
        B       Case_e
        B       Case_f

Case_0
        ;// Case G : index 0, no interpolation - copy the 4x4 block
        M_PRINTF "Case 0 \n"

        ;// Each VLD1 fetches a whole D register per source row; only the
        ;// first 32 bits of each are written out by the store macro.
        ADD     Temp, pSrc, srcStep, LSL #1
        VLD1    dSrc0, [pSrc], srcStep
        VLD1    dSrc2, [Temp], srcStep
        VLD1    dSrc1, [pSrc]
        VLD1    dSrc3, [Temp]

        M_STORE_LUMA4x4 dDst0, dDst1, dDst2, dDst3
        B       Block4x4LoopEnd

Case_1
        ;// Case a : index 1
        ;// Horizontal helper called at pSrc-2; its dAccH0..3 are then
        ;// rounding-averaged (VRHADD) with dSrc0c..3c.
        ;// NOTE(review): dSrc0c..3c are presumably left by the helper - confirm.
        M_PRINTF "Case 1 \n"

        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD  dAccH0, dAccH0, dSrc0c
        VRHADD  dAccH2, dAccH2, dSrc2c
        VRHADD  dAccH1, dAccH1, dSrc1c
        VRHADD  dAccH3, dAccH3, dSrc3c
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        B       Block4x4LoopEnd

Case_2
        ;// Case b : index 2
        ;// Horizontal helper called at pSrc-2; dAccH0..3 stored unchanged.
        M_PRINTF "Case 2 \n"

        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        B       Block4x4LoopEnd

Case_3
        ;// Case c : index 3
        ;// Same as case 1 but averaged with dSrc0d..3d instead of dSrc0c..3c.
        M_PRINTF "Case 3 \n"

        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD  dAccH0, dAccH0, dSrc0d
        VRHADD  dAccH2, dAccH2, dSrc2d
        VRHADD  dAccH1, dAccH1, dSrc1d
        VRHADD  dAccH3, dAccH3, dSrc3d
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        B       Block4x4LoopEnd

Case_4
        ;// Case d : index 4
        ;// Vertical helper called two rows above pSrc; its dAccV0..3 are
        ;// rounding-averaged with dSrc0..dSrc3.
        ;// NOTE(review): dSrc0..dSrc4 are presumably left by the helper - confirm.
        M_PRINTF "Case 4 \n"

        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        VRHADD  dAccV0, dAccV0, dSrc0
        VRHADD  dAccV2, dAccV2, dSrc2
        VRHADD  dAccV1, dAccV1, dSrc1
        VRHADD  dAccV3, dAccV3, dSrc3
        M_STORE_LUMA4x4 dResultV0, dResultV1, dResultV2, dResultV3
        B       Block4x4LoopEnd

Case_5
        ;// Case e : index 5
        ;// Vertical helper at (pSrc - 2 rows), horizontal helper at
        ;// (pSrc - 2 bytes); the two results are rounding-averaged.
        ;// Relies on dAccV0..3 surviving the second call (confirm).
        M_PRINTF "Case 5 \n"

        MOV     pSrcBK, pSrc
        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        SUB     pSrc, pSrcBK, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD  dAccH0, dAccH0, dAccV0
        VRHADD  dAccH2, dAccH2, dAccV2
        VRHADD  dAccH1, dAccH1, dAccV1
        VRHADD  dAccH3, dAccH3, dAccV3
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        B       Block4x4LoopEnd

Case_6
        ;// Case f : index 6
        ;// Diagonal (HorVer) helper at (pSrc - 2 rows - 2 bytes).
        ;// qRes2..qRes5 are narrowed with a saturating rounding shift by 5
        ;// and rounding-averaged with the helper's dTAcc0..3.
        M_PRINTF "Case 6 \n"

        SUB     pSrc, pSrc, srcStep, LSL #1
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        VQRSHRUN    dTResult0, qRes2, #5
        VQRSHRUN    dTResult1, qRes3, #5
        VQRSHRUN    dTResult2, qRes4, #5
        VQRSHRUN    dTResult3, qRes5, #5
        VRHADD  dTAcc0, dTAcc0, dTResult0
        VRHADD  dTAcc2, dTAcc2, dTResult2
        VRHADD  dTAcc1, dTAcc1, dTResult1
        VRHADD  dTAcc3, dTAcc3, dTResult3
        M_STORE_LUMA4x4 dTRes0, dTRes1, dTRes2, dTRes3
        B       Block4x4LoopEnd

Case_7
        ;// Case g : index 7
        ;// As case 5, but the vertical helper starts one byte to the right.
        M_PRINTF "Case 7 \n"
        MOV     pSrcBK, pSrc
        ADD     pSrc, pSrc, #1
        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        SUB     pSrc, pSrcBK, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD  dAccH0, dAccH0, dAccV0
        VRHADD  dAccH2, dAccH2, dAccV2
        VRHADD  dAccH1, dAccH1, dAccV1
        VRHADD  dAccH3, dAccH3, dAccV3
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        B       Block4x4LoopEnd

Case_8
        ;// Case h : index 8
        ;// Vertical helper called two rows above pSrc; dAccV0..3 stored
        ;// unchanged.
        M_PRINTF "Case 8 \n"

        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        M_STORE_LUMA4x4 dResultV0, dResultV1, dResultV2, dResultV3
        B       Block4x4LoopEnd

Case_9
        ;// Case i : index 9
        ;// Diagonal (VerHor) helper at (pSrc - 2 rows - 2 bytes).
        ;// Each 16-bit row dTempX0:dTempX1 is shifted down by two elements
        ;// (VEXT #2), narrowed with a saturating rounding shift by 5, and
        ;// rounding-averaged with the helper's dTAcc0..3.
        M_PRINTF "Case 9 \n"
        SUB     pSrc, pSrc, srcStep, LSL #1
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        VEXT    dTempP0, dTempP0, dTempP1, #2
        VEXT    dTempQ0, dTempQ0, dTempQ1, #2
        VEXT    dTempR0, dTempR0, dTempR1, #2
        VEXT    dTempS0, dTempS0, dTempS1, #2

        VQRSHRUN    dTResult0, qTempP01, #5
        VQRSHRUN    dTResult1, qTempQ01, #5
        VQRSHRUN    dTResult2, qTempR01, #5
        VQRSHRUN    dTResult3, qTempS01, #5

        VRHADD  dTAcc0, dTAcc0, dTResult0
        VRHADD  dTAcc2, dTAcc2, dTResult2
        VRHADD  dTAcc1, dTAcc1, dTResult1
        VRHADD  dTAcc3, dTAcc3, dTResult3
        M_STORE_LUMA4x4 dTRes0, dTRes1, dTRes2, dTRes3
        B       Block4x4LoopEnd

Case_a
        ;// Case j : index 10
        ;// Diagonal (HorVer) helper at (pSrc - 2 rows - 2 bytes); its
        ;// dTAcc0..3 are stored unchanged.
        M_PRINTF "Case a \n"

        SUB     pSrc, pSrc, srcStep, LSL #1
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        M_STORE_LUMA4x4 dTRes0, dTRes1, dTRes2, dTRes3
        B       Block4x4LoopEnd

Case_b
        ;// Case k : index 11
        ;// As case 9, but the 16-bit rows are shifted by three elements.
        M_PRINTF "Case b \n"
        SUB     pSrc, pSrc, srcStep, LSL #1
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
        VEXT    dTempP0, dTempP0, dTempP1, #3
        VEXT    dTempQ0, dTempQ0, dTempQ1, #3
        VEXT    dTempR0, dTempR0, dTempR1, #3
        VEXT    dTempS0, dTempS0, dTempS1, #3

        VQRSHRUN    dTResult0, qTempP01, #5
        VQRSHRUN    dTResult1, qTempQ01, #5
        VQRSHRUN    dTResult2, qTempR01, #5
        VQRSHRUN    dTResult3, qTempS01, #5

        VRHADD  dTAcc0, dTAcc0, dTResult0
        VRHADD  dTAcc2, dTAcc2, dTResult2
        VRHADD  dTAcc1, dTAcc1, dTResult1
        VRHADD  dTAcc3, dTAcc3, dTResult3
        M_STORE_LUMA4x4 dTRes0, dTRes1, dTRes2, dTRes3
        B       Block4x4LoopEnd

Case_c
        ;// Case n : index 12
        ;// As case 4, but averaged with dSrc1..dSrc4 (one register further
        ;// along than case 4's dSrc0..dSrc3).
        M_PRINTF "Case c \n"

        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        VRHADD  dAccV0, dAccV0, dSrc1
        VRHADD  dAccV2, dAccV2, dSrc3
        VRHADD  dAccV1, dAccV1, dSrc2
        VRHADD  dAccV3, dAccV3, dSrc4
        M_STORE_LUMA4x4 dResultV0, dResultV1, dResultV2, dResultV3
        B       Block4x4LoopEnd

Case_d
        ;// Case p : index 13
        ;// As case 5, but the horizontal helper starts one row further down.
        M_PRINTF "Case d \n"

        MOV     pSrcBK, pSrc
        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD     pSrc, pSrcBK, srcStep
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD  dAccH0, dAccH0, dAccV0
        VRHADD  dAccH2, dAccH2, dAccV2
        VRHADD  dAccH1, dAccH1, dAccV1
        VRHADD  dAccH3, dAccH3, dAccV3
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        B       Block4x4LoopEnd

Case_e
        ;// Case q : index 14
        ;// As case 6, but narrowing qRes3..qRes6 instead of qRes2..qRes5.
        M_PRINTF "Case e \n"

        SUB     pSrc, pSrc, srcStep, LSL #1
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
        VQRSHRUN    dTResult0, qRes3, #5
        VQRSHRUN    dTResult1, qRes4, #5
        VQRSHRUN    dTResult2, qRes5, #5
        VQRSHRUN    dTResult3, qRes6, #5

        VRHADD  dTAcc0, dTAcc0, dTResult0
        VRHADD  dTAcc2, dTAcc2, dTResult2
        VRHADD  dTAcc1, dTAcc1, dTResult1
        VRHADD  dTAcc3, dTAcc3, dTResult3
        M_STORE_LUMA4x4 dTRes0, dTRes1, dTRes2, dTRes3
        B       Block4x4LoopEnd

Case_f
        ;// Case r : index 15
        ;// Vertical helper one byte to the right, horizontal helper one row
        ;// further down; the two results are rounding-averaged.
        M_PRINTF "Case f \n"
        MOV     pSrcBK, pSrc
        ADD     pSrc, pSrc, #1
        SUB     pSrc, pSrc, srcStep, LSL #1
        BL      armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ADD     pSrc, pSrcBK, srcStep
        SUB     pSrc, pSrc, #2
        BL      armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
        VRHADD  dAccH0, dAccH0, dAccV0
        VRHADD  dAccH2, dAccH2, dAccV2
        VRHADD  dAccH1, dAccH1, dAccV1
        VRHADD  dAccH3, dAccH3, dAccV3
        M_STORE_LUMA4x4 dResultH0, dResultH1, dResultH2, dResultH3
        ;// last case: falls straight through into Block4x4LoopEnd


Block4x4LoopEnd

        ;// Width loop: restore the arguments saved for this block, then
        ;// step 4 bytes to the right in both source and destination.
        ;//M_ADR   pArgs, ppArgs
        LDM     pArgs, {pSrc,srcStep,pDst,dstStep}
        SUBS    iWidth, iWidth, #4
        ADD     pSrc, pSrc, #4
        ADD     pDst, pDst, #4
        BGT     Block4x4WidthLoop

        ;// Height loop: the flags set by SUBS are consumed by the BGT at the
        ;// end; nothing in between updates them (M_LDR / M_ADR are
        ;// armCOMM_s.h macros and are assumed not to touch the flags).
        ;// pSrc/pDst currently point one full row of blocks to the right of
        ;// the row start, so move 4 rows down and iWidth bytes back.
        SUBS    iHeight, iHeight, #4
        M_LDR   iWidth, ptrWidth
        M_ADR   pArgs, ppArgs
        ADD     pSrc, pSrc, srcStep, LSL #2
        ADD     pDst, pDst, dstStep, LSL #2
        SUB     pSrc, pSrc, iWidth
        SUB     pDst, pDst, iWidth
        BGT     Block4x4HeightLoop

EndOfInterpolation
        ;// Return 0 in r0 (presumably OMX_Sts_NoErr, the success value named
        ;// in the file header)
        MOV     r0, #0
        M_END

    ENDIF
        ;// End of CortexA8

    END
