;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

;//-----------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
;//
;// Syntax : ARM armasm (RVCT); ';' starts a comment.
;// Target : only assembled when the ARM1136JS variant is selected.
;//
;// In     : r0      = pSrcDst     (pixel pointer, filtered in place)
;//          r1      = srcdstStep  (byte distance between rows)
;//          r2      = pAlpha      (two bytes are read: [0] and [1])
;//          r3      = pBeta       (two bytes are read: [0] and [1])
;//          [stack] = pThresholds (ppThresholdsArg)
;//          [stack] = pBS         (ppBSArg)
;// Out    : r0      = OMX_Sts_NoErr (the only value returned here; this
;//                    routine performs no argument checking)
;//
;// Structure
;//   LoopY : one band of 4 rows.  Loads the 4x4 bytes at pQ0 (initially
;//           pSrcDst-4) and transposes them into p_0..p_3, one register
;//           per pixel column, four rows per register.
;//   LoopX : one edge of that band.  Reads one bS byte (pBS advances by
;//           4 per edge), loads/transposes the 4x4 bytes to the right of
;//           the edge into q_0..q_3, builds the per-row filter mask and
;//           calls the bS<4 or bS>=4 kernel.  The results are transposed
;//           back and stored, and the q columns become the p columns of
;//           the next edge.
;//   alpha[0]/beta[0] (pAlphaBeta0) are in registers when a band starts;
;//   alpha[1]/beta[1] (pAlphaBeta1) are reloaded after every edge.
;//
;// NOTE(review): the register contracts of the two imported "_unsafe"
;// kernels are defined in their own source files and are not visible
;// here; the P*a/Q*a and P*b/Q*b aliases below reflect how this file
;// reads their results.
;//-----------------------------------------------------------------------

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe


        IF ARM1136JS

MASK_0      EQU 0x00000000          ;// all-zero lane mask
MASK_1      EQU 0x01010101          ;// byte * MASK_1 replicates the byte into 4 lanes
MASK_2      EQU 0xff00ff00          ;// selects bytes 1 and 3 of a word

;// Combined X/Y loop counter.  It is doubled (ADDS XY, XY, XY) once per
;// edge: the carry flag is set on every 4th doubling (end of LoopX) and
;// the result becomes zero on the 16th doubling (end of LoopY).
LOOP_COUNT  EQU 0x11110000

;// Declare input registers

pSrcDst     RN 0
srcdstStep  RN 1
pAlphaArg   RN 2
pBetaArg    RN 3

pThresholds RN 14
pBS         RN 9
pQ0         RN 0
bS          RN 2

alpha       RN 6
alpha0      RN 6
alpha1      RN 8

beta        RN 7
beta0       RN 7
beta1       RN 9

;// Declare Local/Temporary variables
;//
;// Many of the names below alias the same physical register; the
;// instruction order in the function body depends on that aliasing.

;// Pixels
p_0         RN 3
p_1         RN 5
p_2         RN 4
p_3         RN 2
q_0         RN 8
q_1         RN 9
q_2         RN 10
q_3         RN 12

;// Unpacking
mask        RN 11

row0        RN 2
row1        RN 4
row2        RN 5
row3        RN 3

row4        RN 8
row5        RN 9
row6        RN 10
row7        RN 12
row8        RN 14
row9        RN 7

tunpk0      RN 8
tunpk1      RN 9
tunpk2      RN 10
tunpk3      RN 12
tunpk4      RN 0

tunpk5      RN 1
tunpk6      RN 14
tunpk7      RN 2
tunpk8      RN 5
tunpk9      RN 6


;// Filtering

dp0q0       RN 12
dp1p0       RN 12
dq1q0       RN 12
dp2p0       RN 12
dq2q0       RN 12

ap0q0       RN 1
filt        RN 2

m00         RN 14
m01         RN 11

apflg       RN 0
aqflg       RN 6
apqflg      RN 0


;//Declarations for bSLT4 kernel

tC0         RN 7
ptC0        RN 1

pQ0a        RN 0
Stepa       RN 1
maska       RN 14

P0a         RN 1
P1a         RN 8
Q0a         RN 7
Q1a         RN 11

;//Declarations for bSGE4 kernel

pQ0b        RN 0
Stepb       RN 1
maskb       RN 14

P0b         RN 6
P1b         RN 7
P2b         RN 1
P3b         RN 3

Q0b         RN 9
Q1b         RN 0
Q2b         RN 2
Q3b         RN 3

;// Miscellaneous
XY          RN 8
t0          RN 3
t1          RN 12
t2          RN 14
t7          RN 7
t4          RN 4
t5          RN 1
t8          RN 6
a           RN 0



        ;// Allocate stack memory
        ;//
        ;// NOTE(review): the code below stores pBS with "M_STR pBS, ppBS"
        ;// and reads it back with "M_LDRD XY, pBS, pXYBS"; likewise
        ;// srcdstStep is stored to pStep and read back with
        ;// "M_LDRD pQ0, srcdstStep, ppQ0Step".  This appears to rely on
        ;// ppBS being the word directly after pXYBS and pStep the word
        ;// directly after ppQ0Step.  Confirm against the M_ALLOC macros
        ;// in armCOMM_s.h before reordering or resizing these slots.
        M_ALLOC4 ppThresholds,4
        M_ALLOC4 pQ_3,4
        M_ALLOC4 pP_3,4
        M_ALLOC8 pAlphaBeta0,8
        M_ALLOC8 pAlphaBeta1,8
        M_ALLOC8 pXYBS,4
        M_ALLOC4 ppBS,4
        M_ALLOC8 ppQ0Step,4
        M_ALLOC4 pStep,4

        ;// Function header
        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11

        ;//Input arguments on the stack
        M_ARG   ppThresholdsArg, 4
        M_ARG   ppBSArg, 4

        LDR     t4,=MASK_1

        LDRB    alpha0, [pAlphaArg]
        LDRB    beta0,  [pBetaArg]
        LDRB    alpha1, [pAlphaArg,#1]
        LDRB    beta1,  [pBetaArg,#1]

        ;// Replicate each alpha/beta byte into all four byte lanes
        MUL     alpha0, alpha0, t4
        MUL     beta0, beta0, t4
        MUL     alpha1, alpha1, t4
        MUL     beta1, beta1, t4

        M_STRD  alpha0, beta0, pAlphaBeta0
        M_STRD  alpha1, beta1, pAlphaBeta1

        ;// XY (r8) and pBS (r9) are reused as pixel registers inside the
        ;// loops, so the loop state lives on the stack.
        LDR     XY,=LOOP_COUNT
        M_LDR   pBS, ppBSArg
        M_LDR   pThresholds, ppThresholdsArg
        M_STR   srcdstStep, pStep
        M_STRD  XY, pBS, pXYBS
        M_STR   pThresholds, ppThresholds

        ;// Point at the four bytes to the left of the first edge
        SUB     pQ0, pQ0, #4
LoopY
;//---------------Load Pixels-------------------

;//----------------Pack p0-p3-----------------------
        LDR     mask, =MASK_2

        ;// Read four rows, then step pQ0 back to the first row
        M_LDR   row0, [pQ0], srcdstStep
        M_LDR   row1, [pQ0], srcdstStep
        LDR     row2, [pQ0]
        LDR     row3, [pQ0, srcdstStep]
        SUB     pQ0, pQ0, srcdstStep, LSL #1

        ;// row0 = [r0p0 r0p1 r0p2 r0p3]
        ;// row1 = [r1p0 r1p1 r1p2 r1p3]
        ;// row2 = [r2p0 r2p1 r2p2 r2p3]
        ;// row3 = [r3p0 r3p1 r3p2 r3p3]

        AND     tunpk0, mask, row0
        AND     tunpk6, mask, row0, LSL#8
        UXTAB16 tunpk0, tunpk0, row1, ROR#8
        UXTAB16 tunpk6, tunpk6, row1
        AND     tunpk2, mask, row2
        AND     tunpk3, mask, row2, LSL#8
        UXTAB16 tunpk2, tunpk2, row3, ROR#8
        UXTAB16 tunpk3, tunpk3, row3

        ;// tunpk0 = [r0p0 r1p0 r0p2 r1p2]
        ;// tunpk6 = [r0p1 r1p1 r0p3 r1p3]
        ;// tunpk2 = [r2p0 r3p0 r2p2 r3p2]
        ;// tunpk3 = [r2p1 r3p1 r2p3 r3p3]

        PKHTB   p_0, tunpk0, tunpk2, ASR#16
        PKHTB   p_1, tunpk6, tunpk3, ASR#16
        PKHBT   p_2, tunpk2, tunpk0, LSL#16
        PKHBT   p_3, tunpk3, tunpk6, LSL#16


        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
        ;// p_2 = [r0p2 r1p2 r2p2 r3p2]
        ;// p_3 = [r0p3 r1p3 r2p3 r3p3]

        ;// p_3 shares r2 with bS and filt, so it is kept on the stack
        M_STR   p_3, pP_3

;//----------------Pack q0-q3-----------------------
LoopX
        LDRB    bS, [pBS], #4
        M_STR   pQ0, ppQ0Step           ;// pQ0 (r0) is clobbered by tunpk4 below
        LDR     mask, =MASK_2
        CMP     bS, #0
        M_STR   pBS, ppBS               ;// pBS (r9) is clobbered by row5 below

        ;// The writeback advances pQ0 by 4 even when the edge is skipped
        LDR     row4, [pQ0, #4]!
        BEQ.W   NoFilterBS0
        M_LDR   row5, [pQ0, srcdstStep]!
        M_LDR   row6, [pQ0, srcdstStep]!
        M_LDR   row7, [pQ0, srcdstStep]

        ;// row4 = [r0q3 r0q2 r0q1 r0q0]
        ;// row5 = [r1q3 r1q2 r1q1 r1q0]
        ;// row6 = [r2q3 r2q2 r2q1 r2q0]
        ;// row7 = [r3q3 r3q2 r3q1 r3q0]

        AND     tunpk4, mask, row4
        ;// bS (r2) is overwritten by tunpk7 below, so compare it now.
        ;// The resulting NZCV flags are consumed by "BLT bSLT4"; nothing
        ;// in between may update them (USUB8 only writes the GE bits).
        CMP     bS, #4
        AND     tunpk5, mask, row4, LSL#8
        UXTAB16 tunpk4, tunpk4, row5, ROR#8
        UXTAB16 tunpk5, tunpk5, row5
        AND     tunpk6, mask, row6
        AND     tunpk7, mask, row6, LSL#8
        UXTAB16 tunpk6, tunpk6, row7, ROR#8
        UXTAB16 tunpk7, tunpk7, row7

        ;// tunpk4 = [r0q0 r1q0 r0q2 r1q2]
        ;// tunpk5 = [r0q1 r1q1 r0q3 r1q3]
        ;// tunpk6 = [r2q0 r3q0 r2q2 r3q2]
        ;// tunpk7 = [r2q1 r3q1 r2q3 r3q3]

        PKHTB   q_3, tunpk4, tunpk6, ASR#16
        PKHTB   q_2, tunpk5, tunpk7, ASR#16
        PKHBT   q_1, tunpk6, tunpk4, LSL#16
        M_STR   q_3, pQ_3               ;// q_3 shares r12 with the dXX temporaries
        PKHBT   q_0, tunpk7, tunpk5, LSL#16


        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
        ;// q_2 = [r0q2 r1q2 r2q2 r3q2]
        ;// q_3 = [r0q3 r1q3 r2q3 r3q3]


;//--------------Filtering Decision -------------------
        ;// Pattern used for every test: two USUB8 + SEL give the per-byte
        ;// absolute difference, a third USUB8 against the threshold sets
        ;// the GE bits, and SEL clears the lanes where |diff| >= threshold.
        LDR     m01, =MASK_1                ;//  01010101 mask
        MOV     m00, #MASK_0                ;//  00000000 mask

        ;// Check |p0-q0|<Alpha
        USUB8   dp0q0, p_0, q_0
        USUB8   a, q_0, p_0
        SEL     ap0q0, a, dp0q0
        USUB8   a, ap0q0, alpha
        SEL     filt, m00, m01

        ;// Check |p1-p0|<Beta
        USUB8   dp1p0, p_1, p_0
        USUB8   a, p_0, p_1
        SEL     a, a, dp1p0
        USUB8   a, a, beta
        SEL     filt, m00, filt

        ;// Check |q1-q0|<Beta
        USUB8   dq1q0, q_1, q_0
        USUB8   a, q_0, q_1
        SEL     a, a, dq1q0
        USUB8   a, a, beta
        SEL     filt, m00, filt

        ;// Check ap<Beta
        USUB8   dp2p0, p_2, p_0
        USUB8   a, p_0, p_2
        SEL     a, a, dp2p0
        USUB8   a, a, beta
        SEL     apflg, m00, filt            ;// apflg = filt && (ap<beta)

        ;// Check aq<Beta
        ;// t2 aliases m00 (r14), so the zero mask is gone from here on;
        ;// t7 supplies the zero for the SEL at the top of each kernel
        ;// path.  beta (r7) is dead after the last USUB8, which is why
        ;// t7 may reuse its register.
        USUB8   dq2q0, q_2, q_0
        USUB8   t2, q_0, q_2
        SEL     t2, t2, dq2q0
        USUB8   t2, t2, beta
        MOV     t7,#0


        BLT     bSLT4                       ;// flags from "CMP bS, #4" above
;//-------------------Filter--------------------
bSGE4
        ;//---------bSGE4 Execution---------------
        SEL     t1, t7, filt            ;// aqflg = filt && (aq<beta)
        CMP     filt, #0
        ORR     apqflg, apflg, t1, LSL #1
        ;// No row passed the tests: restore pQ0/step and skip the edge
        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
        BEQ     NoFilterFilt0

        BL      armVCM4P10_DeblockingLumabSGE4_unsafe

        ;//---------Store result---------------

        LDR     maskb,=MASK_2

        ;// P0b = [r0p0 r1p0 r2p0 r3p0]
        ;// P1b = [r0p1 r1p1 r2p1 r3p1]
        ;// P2b = [r0p2 r1p2 r2p2 r3p2]
        ;// P3b = [r0p3 r1p3 r2p3 r3p3]

        M_LDR   P3b, pP_3
        M_STR   Q0b, pP_3               ;// q0 column becomes p_3 of the next edge

        ;//------Pack p0-p3------
        AND     tunpk0, maskb, P0b
        AND     tunpk2, maskb, P0b, LSL#8
        UXTAB16 tunpk0, tunpk0, P1b, ROR#8
        UXTAB16 tunpk2, tunpk2, P1b

        AND     tunpk3, maskb, P2b
        AND     tunpk8, maskb, P2b, LSL#8
        UXTAB16 tunpk3, tunpk3, P3b, ROR#8
        UXTAB16 tunpk8, tunpk8, P3b

        ;// tunpk0 = [r0p0 r0p1 r2p0 r2p1]
        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
        ;// tunpk3 = [r0p2 r0p3 r2p2 r2p3]
        ;// tunpk8 = [r1p2 r1p3 r3p2 r3p3]

        ;// Q1b shares r0 with pQ0b: copy it out (it is also p_2 of the
        ;// next edge) before the pointer is reloaded.
        MOV     p_2, Q1b
        M_LDRD  pQ0b, Stepb, ppQ0Step

        PKHTB   row9, tunpk0, tunpk3, ASR#16
        PKHBT   row7, tunpk3, tunpk0, LSL#16
        PKHTB   row3, tunpk2, tunpk8, ASR#16
        PKHBT   row6, tunpk8, tunpk2, LSL#16

        ;// row9 = [r0p0 r0p1 r0p2 r0p3]
        ;// row3 = [r1p0 r1p1 r1p2 r1p3]
        ;// row7 = [r2p0 r2p1 r2p2 r2p3]
        ;// row6 = [r3p0 r3p1 r3p2 r3p3]

        ;// After the first store pQ0b points at row 1; the last store
        ;// moves it 4 bytes right, onto the q side of row 1.
        M_STR   row9, [pQ0b], Stepb
        STR     row7, [pQ0b, Stepb]
        STR     row6, [pQ0b, Stepb, LSL #1]
        STR     row3, [pQ0b], #4

        M_LDR   Q3b, pQ_3               ;// Q3b shares r3 with p_0 (next edge)

        ;// Q0b = [r0q0 r1q0 r2q0 r3q0]
        ;// Q1b = [r0q1 r1q1 r2q1 r3q1]
        ;// Q2b = [r0q2 r1q2 r2q2 r3q2]
        ;// Q3b = [r0q3 r1q3 r2q3 r3q3]

        ;//------Pack q0-q3------
        AND     tunpk0, maskb, p_2
        AND     tunpk2, maskb, p_2, LSL#8
        UXTAB16 tunpk0, tunpk0, Q0b, ROR#8
        UXTAB16 tunpk2, tunpk2, Q0b

        AND     tunpk3, maskb, Q3b
        AND     tunpk8, maskb, Q3b, LSL#8
        UXTAB16 tunpk3, tunpk3, Q2b, ROR#8
        UXTAB16 tunpk8, tunpk8, Q2b

        ;// tunpk0 = [r0q1 r0q0 r2q1 r2q0]
        ;// tunpk2 = [r1q1 r1q0 r3q1 r3q0]
        ;// tunpk3 = [r0q3 r0q2 r2q3 r2q2]
        ;// tunpk8 = [r1q3 r1q2 r3q3 r3q2]

        PKHTB   row8, tunpk3, tunpk0, ASR#16
        PKHBT   row7, tunpk0, tunpk3, LSL#16
        PKHTB   row4, tunpk8, tunpk2, ASR#16
        PKHBT   row6, tunpk2, tunpk8, LSL#16

        ;// row8 = [r0q0 r0q1 r0q2 r0q3]
        ;// row4 = [r1q0 r1q1 r1q2 r1q3]
        ;// row7 = [r2q0 r2q1 r2q2 r2q3]
        ;// row6 = [r3q0 r3q1 r3q2 r3q3]

        STR     row4, [pQ0b]
        STR     row7, [pQ0b, Stepb]
        STR     row6, [pQ0b, Stepb, LSL #1]

        ;// Back to row 0, already advanced by 4 for the next edge
        SUB     pQ0, pQ0b, Stepb
        MOV     p_1, Q2b                ;// q2 column becomes p_1 of the next edge

        STR     row8, [pQ0]

        M_LDRD  XY, pBS, pXYBS
        M_LDR   pThresholds, ppThresholds
        M_LDRD  alpha, beta, pAlphaBeta1

        ADDS    XY, XY, XY              ;// C = band finished, Z = all 16 edges done
        ADD     pThresholds, pThresholds, #4
        M_STR   pThresholds, ppThresholds
        M_STR   XY, pXYBS
        BCC     LoopX
        B       ExitLoopY

;//---------- Exit of LoopX --------------
;//---- for the case of no filtering -----

NoFilterFilt0
        ;// pQ0 was reloaded from ppQ0Step; step it to the next edge
        ADD     pQ0, pQ0, #4
NoFilterBS0
        ;// Load counter for LoopX
        M_LDRD  XY, pBS, pXYBS
        M_LDR   pThresholds, ppThresholds
        M_LDRD  alpha, beta, pAlphaBeta1

        ;// Align the pointer
        ADDS    XY, XY, XY
        ADD     pThresholds, pThresholds, #4
        M_STR   pThresholds, ppThresholds
        M_STR   XY, pXYBS
        ;// Branch to LoopY, not LoopX: on this path the p_0..p_3
        ;// registers were not refreshed from the q columns, so they are
        ;// re-read from memory at the advanced pQ0.
        BCC     LoopY
        B       ExitLoopY

bSLT4
        ;//---------bSLT4 Execution---------------
        SEL     aqflg, t7, filt         ;// aqflg = filt && (aq<beta)
        M_LDR   ptC0, ppThresholds
        CMP     filt, #0
        ;// No row passed the tests: restore pQ0/step and skip the edge
        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
        BEQ     NoFilterFilt0

        ;// The threshold pointer is advanced here, so this path does not
        ;// add 4 to pThresholds again in its loop tail.
        LDRB    tC0, [ptC0], #4
        M_STR   ptC0, ppThresholds

        BL      armVCM4P10_DeblockingLumabSLT4_unsafe

        ;//---------Store result---------------
        ;//--------Pack p1,p0,q1,q0------------

        ;//Load destination pointer
        LDR     maska,=MASK_2
        M_STR   Q0a, pP_3               ;// q0 column becomes p_3 of the next edge
        MOV     p_1, q_2                ;// q2 column becomes p_1 of the next edge

        ;// P1a = [r0p1 r1p1 r2p1 r3p1]
        ;// P0a = [r0p0 r1p0 r2p0 r3p0]
        ;// Q0a = [r0q0 r1q0 r2q0 r3q0]
        ;// Q1a = [r0q1 r1q1 r2q1 r3q1]

        AND     tunpk1, maska, P0a
        AND     tunpk2, maska, P0a, LSL#8
        UXTAB16 tunpk1, tunpk1, P1a, ROR#8
        UXTAB16 tunpk2, tunpk2, P1a

        ;// P0a shares r1 with Stepa, so reload only after P0a was consumed
        M_LDRD  pQ0a, Stepa, ppQ0Step

        AND     tunpk9, maska, Q1a
        AND     tunpk3, maska, Q1a, LSL#8
        UXTAB16 tunpk9, tunpk9, Q0a, ROR#8
        UXTAB16 tunpk3, tunpk3, Q0a

        ;// tunpk1 = [r0p0 r0p1 r2p0 r2p1]
        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
        ;// tunpk9 = [r0q1 r0q0 r2q1 r2q0]
        ;// tunpk3 = [r1q1 r1q0 r3q1 r3q0]

        MOV     t4, tunpk1, LSR #16
        MOV     t0, tunpk9, LSR #16

        ;// Only the two pixels on each side of the edge are written back
        STRH    t4,[pQ0a, #2]!          ;//Stores [r0p0 r0p1]
        STRH    t0,[pQ0a, #2]           ;//Stores [r0q0 r0q1]

        MOV     t4, tunpk2, LSR #16
        MOV     t0, tunpk3, LSR #16

        M_STRH  t4,[pQ0a, Stepa]!       ;//Stores [r1p0 r1p1]
        STRH    t0,[pQ0a, #2]           ;//Stores [r1q0 r1q1]

        M_STRH  tunpk1,[pQ0a, Stepa]!   ;//Stores [r2p0 r2p1]
        STRH    tunpk2,[pQ0a, Stepa]    ;//Stores [r3p0 r3p1]
        STRH    tunpk9,[pQ0a, #2]!      ;//Stores [r2q0 r2q1]
        STRH    tunpk3,[pQ0a, Stepa]    ;//Stores [r3q0 r3q1]

        ;// pQ0a is now at row 2, +4 bytes; go back to row 0 for the next edge
        SUB     pQ0, pQ0a, Stepa, LSL #1

        ;// Load counter
        M_LDRD  XY, pBS, pXYBS

        ;// Reload Pixels
        M_LDR   p_0, pQ_3               ;// q3 column becomes p_0 of the next edge
        MOV     p_2, Q1a                ;// q1 column becomes p_2 of the next edge

        M_LDRD  alpha, beta, pAlphaBeta1

        ADDS    XY, XY, XY              ;// C = band finished, Z = all 16 edges done
        M_STR   XY, pXYBS
        BCC     LoopX

;//-------- Common Exit of LoopY -----------------
        ;// Align the pointers
        M_LDR   pThresholds, ppThresholds
ExitLoopY
        ;// Four edges advanced pQ0 by 16 and pBS/pThresholds by 16 each:
        ;// rewind pQ0 to the first edge and move down four rows, and
        ;// leave pBS/pThresholds one entry past where the band started.
        ;// None of these instructions sets flags, so the BNE below still
        ;// tests Z from the last "ADDS XY, XY, XY".
        SUB     pQ0, pQ0, #16
        ADD     pQ0, pQ0, srcdstStep, LSL #2
        SUB     pBS, pBS, #15
        SUB     pThresholds, pThresholds, #15
        M_STR   pThresholds, ppThresholds

        ;// The first edge of every band uses alpha[0]/beta[0]
        M_LDRD  alpha, beta, pAlphaBeta0

        BNE     LoopY
        MOV     r0, #OMX_Sts_NoErr

        M_END
;//-----------------End Filter--------------------

        ENDIF

        END