;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

;// ---------------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
;//
;// In-place deblocking of luma pixels across vertical edges.
;//
;// Register arguments:
;//   r0  pSrcDst     - pixel pointer (read and written in place)
;//   r1  srcdstStep  - byte distance between consecutive rows
;//   r2  pAlpha      - two alpha bytes: [0] and [1]
;//   r3  pBeta       - two beta bytes:  [0] and [1]
;// Stack arguments (declared with M_ARG, in this order):
;//   pThresholds, pBS
;// Returns:
;//   r0 = OMX_Sts_NoErr
;//
;// Work is organised as 2 outer passes (LoopY) x 4 inner steps (LoopX).
;// Every inner step handles one edge position over 8 rows; positions are
;// 4 bytes apart, and each outer pass moves down by 8 rows.
;// pAlpha[0]/pBeta[0] are used for the first step of each outer pass and
;// pAlpha[1]/pBeta[1] for the remaining three.
;//
;// The pixel arithmetic itself is done by the two imported *_unsafe helpers;
;// their register contract is not visible in this file.
;// ---------------------------------------------------------------------------

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe

        IF CortexA8

;// Loop control word. It is doubled with ADDS once per inner step:
;//   - carry first appears on the 4th doubling  -> leaves LoopX
;//   - result first becomes zero on the 8th     -> leaves LoopY
LOOP_COUNT      EQU     0x11000000


;// ---------------------------------------------------------------------------
;// Core register aliases
;// ---------------------------------------------------------------------------

;// Function arguments
pSrcDst         RN      0
srcdstStep      RN      1
pAlpha          RN      2
pBeta           RN      3

pThresholds     RN      5
pBS             RN      4
bS10            RN      12          ;// halfword read from pBS: two bS bytes

pAlpha_0        RN      2           ;// same registers as pAlpha / pBeta
pBeta_0         RN      3

pAlpha_1        RN      7           ;// pAlpha_0 + 1
pBeta_1         RN      8           ;// pBeta_0 + 1

pTmp            RN      10          ;// second row pointer (odd rows)
pTmpStep        RN      11          ;// 2 * srcdstStep

;// Loop control
XY              RN      9


;// ---------------------------------------------------------------------------
;// NEON register aliases
;// ---------------------------------------------------------------------------

;// Rows as loaded. The register numbers are chosen so that, once the 8x8
;// transpose has run, each one already is the pixel column named below.
dRow0           DN      D7.U8       ;// -> dP_3
dRow1           DN      D8.U8       ;// -> dQ_0
dRow2           DN      D5.U8       ;// -> dP_1
dRow3           DN      D10.U8      ;// -> dQ_2
dRow4           DN      D6.U8       ;// -> dP_2
dRow5           DN      D9.U8       ;// -> dQ_1
dRow6           DN      D4.U8       ;// -> dP_0
dRow7           DN      D11.U8      ;// -> dQ_3

;// Rows stored on the bS < 4 path (P_0, P_1, Q_0, Q_1 replaced by the
;// filtered "n" registers; the rest are the untouched inputs).
dRown0          DN      D7.U8       ;// dP_3
dRown1          DN      D24.U8      ;// dQ_0n
dRown2          DN      D30.U8      ;// dP_1n
dRown3          DN      D10.U8      ;// dQ_2
dRown4          DN      D6.U8       ;// dP_2
dRown5          DN      D25.U8      ;// dQ_1n
dRown6          DN      D29.U8      ;// dP_0n
dRown7          DN      D11.U8      ;// dQ_3

;// Rows stored on the bS >= 4 path (P_2 and Q_2 are replaced as well).
dRow0n          DN      D7.U8       ;// dP_3
dRow1n          DN      D24.U8      ;// dQ_0n
dRow2n          DN      D30.U8      ;// dP_1n
dRow3n          DN      D28.U8      ;// dQ_2n
dRow4n          DN      D31.U8      ;// dP_2n
dRow5n          DN      D25.U8      ;// dQ_1n
dRow6n          DN      D29.U8      ;// dP_0n
dRow7n          DN      D11.U8      ;// dQ_3

;// Pixel columns after the transpose: p0-p3 in d4-d7, q0-q3 in d8-d11
dP_0            DN      D4.U8
dP_1            DN      D5.U8
dP_2            DN      D6.U8
dP_3            DN      D7.U8
dQ_0            DN      D8.U8
dQ_1            DN      D9.U8
dQ_2            DN      D10.U8
dQ_3            DN      D11.U8


;// Filtering decision
dAlpha          DN      D0.U8
dBeta           DN      D2.U8

dFilt           DN      D16.U8
dAqflg          DN      D12.U8      ;// overlays dAp1p0
dApflg          DN      D17.U8      ;// overlays dAq2q0

dAp0q0          DN      D13.U8
dAp1p0          DN      D12.U8
dAq1q0          DN      D18.U8
dAp2p0          DN      D19.U8
dAq2q0          DN      D17.U8

;// bS < 4 working registers (not referenced by the code in this file)
dTC0            DN      D18.U8
dTC1            DN      D19.U8
dTC01           DN      D18.U8

dTCs            DN      D31.S8
dTC             DN      D31.U8

dMask_0         DN      D14.U8
dMask_1         DN      D15.U8

Mask_0          RN      6

dTemp           DN      D19.U8

;// Computing P0,Q0
qDq0p0          QN      Q10.S16
qDp1q1          QN      Q11.S16
qDelta          QN      Q10.S16     ;// same register as qDq0p0
dDelta          DN      D20.S8


;// Computing P1,Q1
dRp0q0          DN      D24.U8

dMaxP           DN      D23.U8
dMinP           DN      D22.U8

dMaxQ           DN      D19.U8
dMinQ           DN      D21.U8

dDeltaP         DN      D26.U8
dDeltaQ         DN      D27.U8

qP_0n           QN      Q14.S16
qQ_0n           QN      Q12.S16

dQ_0n           DN      D24.U8
dQ_1n           DN      D25.U8
dP_0n           DN      D29.U8
dP_1n           DN      D30.U8

;// bS >= 4 working registers
qSp0q0          QN      Q10.U16

qSp2q1          QN      Q11.U16
qSp0q0p1        QN      Q12.U16
qSp3p2          QN      Q13.U16
dHSp0q1         DN      D28.U8

qSq2p1          QN      Q11.U16     ;// same register as qSp2q1
qSp0q0q1        QN      Q12.U16     ;// same register as qSp0q0p1
qSq3q2          QN      Q13.U16     ;// same register as qSp3p2
dHSq0p1         DN      D28.U8      ;// same register as dHSp0q1

qTemp1          QN      Q11.U16     ;// same register as qSp2q1
qTemp2          QN      Q12.U16     ;// same register as qSp0q0p1

dP_0t           DN      D28.U8      ;// same register as dHSp0q1
dQ_0t           DN      D22.U8      ;// low half of qTemp1 (Q11)

;// Filtered outputs. The first two P and Q names repeat the definitions
;// given above with the same register numbers.
dP_0n           DN      D29.U8
dP_1n           DN      D30.U8
dP_2n           DN      D31.U8

dQ_0n           DN      D24.U8      ;// low half of qTemp2 (Q12)
dQ_1n           DN      D25.U8      ;// high half of qTemp2 (Q12)
dQ_2n           DN      D28.U8      ;// same register as dP_0t / dHSp0q1
                                    ;// NOTE(review): the original note here
                                    ;// named dQ_0t, which is D22 - confirm


        ;// -------------------------------------------------------------------
        ;// Function header
        ;// The "r11, d15" operands presumably tell the macro which registers
        ;// to preserve - confirm against armCOMM_s.h.
        ;// -------------------------------------------------------------------
        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15

        ;// Arguments on the stack
        M_ARG   ppThresholds, 4
        M_ARG   ppBS, 4

        ;// Pointers to the second alpha / beta byte
        ADD         pAlpha_1, pAlpha_0, #1
        ADD         pBeta_1, pBeta_0, #1

        ;// Start with alpha[0] / beta[0] replicated into every lane of d0 / d2.
        ;// The pixel pointer is moved 4 bytes back so that one 8-byte row load
        ;// covers p3..p0 followed by q0..q3.
        VLD1        {dAlpha[]}, [pAlpha_0]
        SUB         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_0]

        M_LDR       pBS, ppBS
        M_LDR       pThresholds, ppThresholds

        ;// Zero word used below to clear one half of dFilt
        MOV         Mask_0,#0

        ;// d14 / d15 constants. They are not read again in this file;
        ;// presumably the imported helpers consume them - confirm.
        VMOV        dMask_0, #0
        VMOV        dMask_1, #1

        LDR         XY,=LOOP_COUNT

        ;// Each of the two row pointers skips every other row
        ADD         pTmpStep, srcdstStep, srcdstStep

LoopY
LoopX
        ;// Fetch two bS bytes and step pBS on by 4
        LDRH        bS10, [pBS], #4

        ;// Both bS bytes zero: nothing to filter at this position
        CMP         bS10, #0
        BEQ         NoFilterBS0

        ;// -------------------------------------------------------------------
        ;// Load 8 rows of 8 pixels. pSrcDst walks rows 0,2,4,6 and pTmp walks
        ;// rows 1,3,5,7. The first transpose steps are interleaved with the
        ;// loads.
        ;//
        ;// As loaded, row k holds (lane 7 down to lane 0):
        ;//     [q3rk q2rk q1rk q0rk p0rk p1rk p2rk p3rk]
        ;// -------------------------------------------------------------------
        ADD         pTmp, pSrcDst, srcdstStep
        VLD1        dRow0, [pSrcDst], pTmpStep
        VLD1        dRow1, [pTmp], pTmpStep
        VLD1        dRow2, [pSrcDst], pTmpStep
        VZIP.8      dRow0, dRow1
        VLD1        dRow3, [pTmp], pTmpStep
        VLD1        dRow4, [pSrcDst], pTmpStep
        VZIP.8      dRow2, dRow3
        VLD1        dRow5, [pTmp], pTmpStep
        VLD1        dRow6, [pSrcDst], pTmpStep
        VLD1        dRow7, [pTmp], pTmpStep
        VZIP.8      dRow4, dRow5
        VZIP.16     dRow1, dRow3

        ;// Remainder of the 8x8 transpose
        VZIP.8      dRow6, dRow7

        ;// The four post-increments moved pSrcDst down 8 rows; rewind it so
        ;// the results can be stored over the same rows.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VZIP.16     dRow0, dRow2
        VZIP.16     dRow5, dRow7

        VZIP.16     dRow4, dRow6
        VZIP.32     dRow1, dRow5
        VZIP.32     dRow2, dRow6
        VZIP.32     dRow3, dRow7
        VZIP.32     dRow0, dRow4

        ;// After the transpose each register is one pixel column, rows 0-7
        ;// in lanes 0-7:
        ;//     dP_x = [pxr7 pxr6 pxr5 pxr4 pxr3 pxr2 pxr1 pxr0]
        ;//     dQ_x = [qxr7 qxr6 qxr5 qxr4 qxr3 qxr2 qxr1 qxr0]

        ;// -------------------------------------------------------------------
        ;// Filtering decision.
        ;// Do not reorder: the TST flags are consumed by the VMOVEQ/BNE further
        ;// down, and dAqflg / dApflg are written over dAp1p0 / dAq2q0 only
        ;// after those values have been used.
        ;// -------------------------------------------------------------------
        VABD        dAp0q0, dP_0, dQ_0          ;// |p0 - q0|
        VABD        dAp1p0, dP_1, dP_0          ;// |p1 - p0|

        VABD        dAq1q0, dQ_1, dQ_0          ;// |q1 - q0|
        VABD        dAp2p0, dP_2, dP_0          ;// |p2 - p0|

        TST         bS10, #0xff                 ;// first bS byte zero?
        VCGT        dFilt, dAlpha, dAp0q0       ;// alpha > |p0 - q0|

        VMAX        dAp1p0, dAq1q0, dAp1p0      ;// max(|q1 - q0|, |p1 - p0|)
        VABD        dAq2q0, dQ_2, dQ_0          ;// |q2 - q0|

        VMOVEQ.U32  dFilt[0], Mask_0            ;// yes: no filtering in lanes 0-3
        TST         bS10, #0xff00               ;// second bS byte zero?

        VCGT        dAp2p0, dBeta, dAp2p0       ;// beta > |p2 - p0|
        VCGT        dAp1p0, dBeta, dAp1p0       ;// beta > max(|q1-q0|, |p1-p0|)

        VMOVEQ.U32  dFilt[1], Mask_0            ;// yes: no filtering in lanes 4-7

        VCGT        dAq2q0, dBeta, dAq2q0       ;// beta > |q2 - q0|
        VAND        dFilt, dFilt, dAp1p0
        TST         bS10, #4                    ;// bit 2 selects the bS >= 4 path

        VAND        dAqflg, dFilt, dAq2q0
        VAND        dApflg, dFilt, dAp2p0

        BNE         bSGE4
bSLT4
        ;// -------------------------------------------------------------------
        ;// bS < 4 filtering
        ;// -------------------------------------------------------------------
        BL          armVCM4P10_DeblockingLumabSLT4_unsafe

        ;// Transpose the columns back into rows, using the filtered
        ;// P_0/P_1/Q_0/Q_1 and the original P_2/P_3/Q_2/Q_3.
        VZIP.8      dP_3,  dP_2
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2,  dQ_3

        VZIP.16     dP_3,  dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2,  dP_0n

        VZIP.32     dP_3,  dQ_0n
        VZIP.32     dP_1n, dQ_2
        VZIP.32     dP_2,  dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// Store rows 0-7 (dRown0..7 are the registers just transposed).
        ;// ADDS advances the loop word; the flags it sets stay live until
        ;// BCC below and BNE at ExitLoopY, so nothing in between may set flags.
        VST1        dRown0, [pSrcDst], pTmpStep
        VST1        dRown1, [pTmp], pTmpStep
        VST1        dRown2, [pSrcDst], pTmpStep
        VST1        dRown3, [pTmp], pTmpStep
        VST1        dRown4, [pSrcDst], pTmpStep
        VST1        dRown5, [pTmp], pTmpStep
        ADDS        XY, XY, XY
        VST1        dRown6, [pSrcDst], pTmpStep
        ;// NOTE(review): 2 here, 4 on the other two paths. Presumably the
        ;// bSLT4 helper has already advanced pThresholds by 2 - confirm
        ;// against armVCM4P10_DeblockingLumabSLT4_unsafe.
        ADD         pThresholds, pThresholds, #2
        ;// pTmp is recomputed before its next use, so this last increment
        ;// amount has no effect on later accesses.
        VST1        dRown7, [pTmp], srcdstStep

        ;// Back to row 0, 4 bytes to the right; switch to alpha[1] / beta[1]
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]

        BCC         LoopX
        B           ExitLoopY

NoFilterBS0
        ;// -------------------------------------------------------------------
        ;// Nothing to filter: only advance the pointers and the loop word
        ;// -------------------------------------------------------------------
        ADD         pSrcDst, pSrcDst, #4
        ADDS        XY, XY, XY
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pThresholds, pThresholds, #4
        VLD1        {dBeta[]}, [pBeta_1]
        BCC         LoopX
        B           ExitLoopY
bSGE4
        ;// -------------------------------------------------------------------
        ;// bS >= 4 filtering
        ;// -------------------------------------------------------------------
        BL          armVCM4P10_DeblockingLumabSGE4_unsafe

        ;// Transpose the columns back into rows, using the filtered
        ;// P_0..P_2 / Q_0..Q_2 and the original P_3 / Q_3.
        VZIP.8      dP_3,  dP_2n
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2n, dQ_3

        VZIP.16     dP_3,  dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2n
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2n, dP_0n

        VZIP.32     dP_3,  dQ_0n
        VZIP.32     dP_1n, dQ_2n
        VZIP.32     dP_2n, dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// Store rows 0-7 (dRow0n..7n are the registers just transposed).
        ;// As above, the ADDS flags must survive until BCC / BNE.
        VST1        dRow0n, [pSrcDst], pTmpStep
        VST1        dRow1n, [pTmp], pTmpStep
        VST1        dRow2n, [pSrcDst], pTmpStep
        VST1        dRow3n, [pTmp], pTmpStep
        VST1        dRow4n, [pSrcDst], pTmpStep
        VST1        dRow5n, [pTmp], pTmpStep
        ADDS        XY,XY,XY
        VST1        dRow6n, [pSrcDst], pTmpStep
        ADD         pThresholds, pThresholds, #4
        VST1        dRow7n, [pTmp], pTmpStep

        ;// Back to row 0, 4 bytes to the right; switch to alpha[1] / beta[1]
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]

        BCC         LoopX

ExitLoopY
        ;// -------------------------------------------------------------------
        ;// End of one outer pass.
        ;//   pBS:         four steps of +4, then -14  -> net +2
        ;//   pThresholds: rewound by the same 14
        ;//   pSrcDst:     four steps of +4 undone (-16), then 8 rows down
        ;// alpha[0] / beta[0] are reloaded for the first step of the next
        ;// pass. None of these instructions sets flags, so BNE still tests
        ;// the Z flag from the last ADDS XY.
        ;// -------------------------------------------------------------------
        SUB         pBS, pBS, #14
        SUB         pThresholds, pThresholds, #14
        SUB         pSrcDst, pSrcDst, #16
        VLD1        {dAlpha[]}, [pAlpha_0]
        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dBeta[]}, [pBeta_0]
        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr

        M_END

        ENDIF


        END