;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description (derived from the code below):
;//   In-place deblocking of two vertical chroma edges. The routine runs
;//   2 (outer, LoopY) x 2 (inner, LoopX) iterations. Every iteration handles
;//   one edge over 4 pixel rows: it reads p1,p0 (the two bytes left of the
;//   edge) and q0,q1 (the two bytes right of it) for each row, packs them
;//   one-row-per-byte-lane, and filters all four rows at once with the ARMv6
;//   SIMD byte instructions. Only p0 and q0 are written back.
;//   LoopX moves 4 bytes to the right, LoopY moves 4 rows down.
;//
;// Inputs:
;//   r0       pSrcDst     - address of q0 in the first row of the first edge
;//   r1       srcdstStep  - byte distance between two rows
;//   r2       pAlphaArg   - 2 bytes: alpha for the first / second edge
;//   r3       pBetaArg    - 2 bytes: beta  for the first / second edge
;//   [stack]  pThresholds - tC0 bytes; [0..3] are consumed for the first
;//                          edge, [4..7] for the second edge
;//   [stack]  pBS         - bS bytes; [0..3] are consumed for the first
;//                          edge, [8..11] for the second edge
;// Output:
;//   r0 = OMX_Sts_NoErr. No argument checking is performed in this file.
;//
;// NOTE(review): all byte-lane comments assume little-endian data accesses.
;// That is the only layout under which the bytes loaded as p0/q0 are the
;// same bytes that are stored back at StoreResultAndExit.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS


        IF  ARM1136JS


MASK_0      EQU 0x00000000          ;// all byte lanes 0
MASK_1      EQU 0x01010101          ;// 1 in every byte lane
MASK_2      EQU 0x0000ff00          ;// selects byte lane 1
LOOP_COUNT  EQU 0x50000000          ;// doubled once per iteration, see below

;// The RN aliases below deliberately share physical registers; an alias is
;// only valid between the instruction that defines it and the first write
;// to another alias of the same register.

;// Declare input registers

pSrcDst     RN 0
srcdstStep  RN 1
pAlphaArg   RN 2
pBetaArg    RN 3

pThresholds RN 6
pBS         RN 9
pQ0         RN 0
bS          RN 2
bSTemp      RN 10

alpha       RN 6
alpha0      RN 6
alpha1      RN 8

beta        RN 7
beta0       RN 7
beta1       RN 9

;// Declare Local/Temporary variables

;// Pixels
p_0         RN 3
p_1         RN 5
q_0         RN 8
q_1         RN 9

;// Unpacking
mask        RN 11

row0        RN 2
row1        RN 4
row2        RN 5
row3        RN 3

row4        RN 8
row5        RN 9
row6        RN 10
row7        RN 12

tunpk0      RN 2
tunpk2      RN 10
tunpk3      RN 12

tunpk4      RN 4
tunpk5      RN 5
tunpk6      RN 14
tunpk7      RN 2

;// Filtering

dp0q0       RN 12
dp1p0       RN 12
dq1q0       RN 12

ap0q0       RN 4
filt        RN 2

m00         RN 14
m01         RN 11

pQ0         RN 0                    ;// same register as the pQ0 alias above
Step        RN 1

;// Output

P_0         RN 6
Q_0         RN 7

;//Declarations for bSLT4 kernel

tC          RN 12
tC0         RN 5
tC1         RN 12
pos         RN 5
neg         RN 9

;//Declarations for bSGE4 kernel


;// Miscellaneous
XY          RN 8

a           RN 10
t1          RN 10
t2          RN 12
t3          RN 14
t4          RN 6
t5          RN 5


        ;// Allocate stack memory
        ;// NOTE(review): the code accesses pXYBS with doubleword
        ;// load/store (XY + pBS) but updates the pBS half through ppBS.
        ;// That only works if ppBS is placed directly after the pXYBS
        ;// word - confirm against the M_ALLOC* macros in armCOMM_s.h.
        M_ALLOC4 ppThresholds,4
        M_ALLOC8 pAlphaBeta0,8
        M_ALLOC8 pAlphaBeta1,8
        M_ALLOC8 pXYBS,4
        M_ALLOC4 ppBS,4

        ;// Function header
        ;// NOTE(review): r11 is presumably the highest callee-saved
        ;// register preserved by the macro; r12 and r14 are also used as
        ;// scratch below - confirm M_START semantics in armCOMM_s.h.
        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r11

        ;//Input arguments on the stack
        M_ARG   ppThresholdsArg, 4
        M_ARG   ppBSArg, 4

        ;// Fetch alpha/beta for both edges. pThresholds shares r6 with
        ;// alpha0, so it is spilled before alpha0 is loaded.
        LDRB    alpha1, [pAlphaArg,#1]
        LDRB    beta1,  [pBetaArg,#1]
        M_LDR   pThresholds, ppThresholdsArg
        LDR     a,=MASK_1
        LDRB    beta0,  [pBetaArg]
        M_STR   pThresholds, ppThresholds
        LDRB    alpha0, [pAlphaArg]

        ;// Replicate each 8-bit value into all four byte lanes
        MUL     alpha1, alpha1, a
        MUL     beta1, beta1, a
        MUL     alpha0, alpha0, a
        MUL     beta0, beta0, a

        ;// beta1 shares r9 with pBS: store the pair before loading pBS.
        ;// alpha0/beta0 stay live in alpha/beta for the first iteration.
        M_STRD  alpha1, beta1, pAlphaBeta1
        M_LDR   pBS, ppBSArg
        M_STRD  alpha0, beta0, pAlphaBeta0

        ;// Loop counter: XY is doubled (ADDS XY, XY, XY) after every
        ;// iteration. 0x50000000 -> 0xA0000000 (C=0) -> 0x40000000 (C=1,
        ;// Z=0) -> 0x80000000 (C=0) -> 0 (C=1, Z=1).
        ;// Carry set = inner loop done, Zero set = outer loop done.
        LDR     XY,=LOOP_COUNT
        M_STRD  XY, pBS, pXYBS


LoopY
LoopX
;//---------------Load Pixels-------------------

;//----------------Pack q0-q1-----------------------
        ;// Load two consecutive bS bytes and advance pBS by 8
        LDRH    bS, [pBS], #8
        LDR     mask, =MASK_2

        ;// The M_LDRH forms leave pQ0 advanced by srcdstStep each
        ;// (undone further down by SUB pQ0, pQ0, srcdstStep, LSL #1)
        M_LDRH  row4, [pQ0], srcdstStep
        CMP     bS, #0
        M_STR   pBS, ppBS
        M_LDRH  row5, [pQ0], srcdstStep
        BEQ.W   NoFilterBS0             ;// both bS bytes zero: skip edge
        LDRH    row6, [pQ0]
        LDRH    row7, [pQ0, srcdstStep]

        ;// Byte lanes are written most significant first.
        ;// row4 = [0 0 r0q1 r0q0]
        ;// row5 = [0 0 r1q1 r1q0]
        ;// row6 = [0 0 r2q1 r2q0]
        ;// row7 = [0 0 r3q1 r3q0]

        AND     tunpk4, mask, row4
        AND     tunpk5, mask, row4, LSL#8
        UXTAB   tunpk4, tunpk4, row5, ROR#8
        UXTAB   tunpk5, tunpk5, row5
        AND     tunpk6, mask, row6
        AND     tunpk7, mask, row6, LSL#8
        UXTAB   tunpk6, tunpk6, row7, ROR#8
        UXTAB   tunpk7, tunpk7, row7

        ;// tunpk4 = [0 0 r0q1 r1q1]
        ;// tunpk5 = [0 0 r0q0 r1q0]
        ;// tunpk6 = [0 0 r2q1 r3q1]
        ;// tunpk7 = [0 0 r2q0 r3q0]

        ;// Back to row 0, two bytes left of the edge (address of p1)
        SUB     pQ0, pQ0, srcdstStep, LSL #1
        SUB     pQ0, pQ0, #2

        PKHBT   q_1, tunpk6, tunpk4, LSL#16
        PKHBT   q_0, tunpk7, tunpk5, LSL#16

        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]


;//----------------Pack p0-p1-----------------------

        M_LDRH  row0, [pQ0], srcdstStep
        M_LDRH  row1, [pQ0], srcdstStep
        LDRH    row2, [pQ0]
        LDRH    row3, [pQ0, srcdstStep]

        ;// row0 = [0 0 r0p0 r0p1]
        ;// row1 = [0 0 r1p0 r1p1]
        ;// row2 = [0 0 r2p0 r2p1]
        ;// row3 = [0 0 r3p0 r3p1]

        AND     tunpk2, mask, row0
        AND     tunpk6, mask, row0, LSL#8
        UXTAB   tunpk2, tunpk2, row1, ROR#8
        UXTAB   tunpk6, tunpk6, row1

        AND     tunpk0, mask, row2
        AND     tunpk3, mask, row2, LSL#8
        UXTAB   tunpk0, tunpk0, row3, ROR#8
        UXTAB   tunpk3, tunpk3, row3

        ;// tunpk2 = [0 0 r0p0 r1p0]
        ;// tunpk6 = [0 0 r0p1 r1p1]
        ;// tunpk0 = [0 0 r2p0 r3p0]
        ;// tunpk3 = [0 0 r2p1 r3p1]

        PKHBT   p_0, tunpk0, tunpk2, LSL#16
        M_LDR   bSTemp, ppBS            ;// bS (r2) was reused; refetch below
        PKHBT   p_1, tunpk3, tunpk6, LSL#16

        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]

;//--------------Filtering Decision -------------------
        ;// filt ends up with 1 in every lane (row) that must be filtered.
        ;// USUB8 only updates the GE flags and SEL only reads them, so the
        ;// Z flag produced by "TST bSTemp, #4" survives until "BEQ bSLT4".
        USUB8   dp0q0, p_0, q_0
        LDR     m01, =MASK_1
        LDRH    bSTemp, [bSTemp ,#-8]   ;// the same two bS bytes as above
        MOV     m00, #MASK_0            ;//  00000000 mask

        MOV     filt, m01
        TST     bSTemp, #0xff00
        MOVEQ   filt, filt, LSL #16     ;// 2nd bS byte == 0: drop rows 2,3
        TST     bSTemp, #0xff
        MOVEQ   filt, filt, LSR #16     ;// 1st bS byte == 0: drop rows 0,1
        TST     bSTemp, #4              ;// bit 2 of the 1st bS byte picks
                                        ;// the kernel (Z=1 -> bSLT4)

        ;// Check |p0-q0|<Alpha
        USUB8   a, q_0, p_0
        SEL     ap0q0, a, dp0q0         ;// per-lane |p0-q0|
        USUB8   a, ap0q0, alpha         ;// GE set where |p0-q0| >= alpha
        SEL     filt, m00, filt         ;// ... clear filt in those lanes

        ;// Check |p1-p0|<Beta
        USUB8   dp1p0, p_1, p_0
        USUB8   a, p_0, p_1
        SEL     a, a, dp1p0
        USUB8   a, a, beta
        SEL     filt, m00, filt

        ;// Check |q1-q0|<Beta
        USUB8   dq1q0, q_1, q_0
        USUB8   a, q_0, q_1
        SEL     a, a, dq1q0
        USUB8   a, a, beta
        SEL     filt, m00, filt

        BEQ     bSLT4
;//-------------------Filter--------------------
bSGE4
        ;//---------bSGE4 Execution---------------
        ;// P0' = (2*p1 + p0 + q1 + 2) >> 2, Q0' = (2*q1 + q0 + p1 + 2) >> 2
        ;// computed per lane as ((((x+y)>>1) - (255-z)) >> 1) ^ 0x80
        CMP     filt, #0

        M_LDR   pThresholds, ppThresholds

        ;// Compute P0b
        UHADD8  t1, p_0, q_1
        BEQ     NoFilterFilt0           ;// no lane passed the checks
        MVN     t2, p_1
        UHSUB8  t1, t1, t2
        USUB8   t2, filt, m01           ;// GE set in lanes where filt == 1
        EOR     t1, t1, m01, LSL #7

        ;// Thresholds are not used by this kernel; skip the 4 bytes that
        ;// belong to this edge
        ADD     pThresholds,pThresholds, #4

        ;// Compute Q0b
        UHADD8  t2, q_0, p_1
        MVN     t3, q_1
        UHSUB8  t2, t2, t3
        M_STR   pThresholds, ppThresholds   ;// P_0 shares r6: spill first
        SEL     P_0, t1, p_0
        EOR     t2, t2, m01, LSL #7
        SEL     Q_0, t2, q_0

        B       StoreResultAndExit

;//---------- Exit of LoopX --------------
;//---- for the case of no filtering -----

NoFilterFilt0
        ADD     pQ0, pQ0, #2            ;// undo the -2 applied for p loads
NoFilterBS0
        M_LDR   pThresholds, ppThresholds
        SUB     pQ0, pQ0, srcdstStep, LSL #1    ;// back to row 0
        ADD     pQ0, pQ0, #4                    ;// next edge, 4 bytes right
        ADD     pThresholds, pThresholds, #4
        ;// Load counter for LoopX
        M_LDRD  XY, pBS, pXYBS
        M_STR   pThresholds, ppThresholds
        M_LDRD  alpha, beta, pAlphaBeta1        ;// second-edge alpha/beta

        ;// Align the pointer
        ADDS    XY, XY, XY
        M_STR   XY, pXYBS
        BCC     LoopY                   ;// LoopY and LoopX are the same address
        B       ExitLoopY

bSLT4
        ;//---------bSLT4 Execution---------------
        M_LDR   pThresholds, ppThresholds
        CMP     filt, #0


        ;// Since beta <= 18 and alpha <= 255 we know
        ;// -254 <= p0-q0 <= 254
        ;//  -17 <= q1-q0 <= 17
        ;//  -17 <= p1-p0 <= 17

        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
        ;//
        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3

        USUB8   t1, p_1, p_0
        USUB8   t2, q_1, q_0
        BEQ     NoFilterFilt0           ;// no lane passed the checks

        ;// Two threshold bytes are used, 4 are consumed in total.
        ;// tC0 shares r5 with p_1 and tC1 shares r12 with t2, hence the
        ;// interleaving with the arithmetic.
        LDRB    tC0, [pThresholds], #1
        SSUB8   t1, t1, t2
        LDRB    tC1, [pThresholds], #3
        M_STR   pThresholds, ppThresholds   ;// t4 shares r6: spill first
        UHSUB8  t4, p_0, q_0
        ORR     tC, tC1, tC0, LSL #16
        USUB8   t5, p_0, q_0
        AND     t5, t5, m01
        SHSUB8  t1, t1, t5
        ORR     tC, tC, LSL #8          ;// tC = [tC0 tC0 tC1 tC1]
        SSUB8   t1, t1, t5
        SHSUB8  t1, t1, t4
        UQADD8  tC, tC, m01             ;// tC + 1 in every lane (saturating)
        SADD8   t1, t1, m01
        USUB8   t5, filt, m01           ;// GE set in lanes where filt == 1
        SHSUB8  t1, t1, t4
        SEL     tC, tC, m00             ;// tC = 0 in lanes not to be filtered

        ;// Split into positive and negative part and clip

        SSUB8   t1, t1, m00             ;// GE set where t1 >= 0 (signed)
        SEL     pos, t1, m00
        USUB8   neg, pos, t1
        USUB8   t3, pos, tC
        SEL     pos, tC, pos            ;// pos = min(pos, tC)
        USUB8   t3, neg, tC
        SEL     neg, tC, neg            ;// neg = min(neg, tC)
        UQADD8  P_0, p_0, pos
        UQSUB8  Q_0, q_0, pos
        UQSUB8  P_0, P_0, neg
        UQADD8  Q_0, Q_0, neg

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t1, filt, m01
        SEL     P_0, P_0, p_0
        SEL     Q_0, Q_0, q_0

StoreResultAndExit

        ;//---------Store result---------------

        ;// P_0 = [r0p0 r1p0 r2p0 r3p0]
        ;// Q_0 = [r0q0 r1q0 r2q0 r3q0]

        ;// pQ0 -> p0 of row 0 (q0 is the byte at offset +1)
        SUB     pQ0, pQ0, srcdstStep, LSL #1
        ADD     pQ0, pQ0, #1

        ;// Row 0
        MOV     t1, Q_0, LSR #24
        STRB    t1, [pQ0, #1]
        MOV     t1, P_0, LSR #24
        M_STRB  t1, [pQ0], srcdstStep

        ;// Row 1
        MOV     t1, Q_0, LSR #16
        STRB    t1, [pQ0, #1]
        MOV     t1, P_0, LSR #16
        M_STRB  t1, [pQ0], srcdstStep

        ;// Rows 2 and 3: p0 of both rows first, then step pQ0 onto the q0
        ;// column with the write-back form and store q0 of both rows
        MOV     t1, P_0, LSR #8
        STRB    t1, [pQ0]
        STRB    P_0, [pQ0, srcdstStep]
        MOV     t1, Q_0, LSR #8
        STRB    t1, [pQ0, #1]!
        STRB    Q_0, [pQ0, srcdstStep]

        M_LDRD  XY, pBS, pXYBS
        M_LDRD  alpha, beta, pAlphaBeta1        ;// second-edge alpha/beta

        SUB     pQ0, pQ0, srcdstStep, LSL #1    ;// back to row 0
        ADD     pQ0, pQ0, #4                    ;// next edge, 4 bytes right

        ADDS    XY, XY, XY
        M_STR   XY, pXYBS
        BCC     LoopX

;//-------- Common Exit of LoopY -----------------
        ;// Align the pointers

ExitLoopY
        ;// Nothing below alters the flags, so BNE still tests the Z flag
        ;// of the ADDS that ended the inner loop.

        M_LDR   pThresholds, ppThresholds
        SUB     pQ0, pQ0, #8                    ;// back to the first edge
        ADD     pQ0, pQ0, srcdstStep, LSL #2    ;// 4 rows down
        SUB     pBS, pBS, #14                   ;// 2*8 consumed -> net +2
        SUB     pThresholds, pThresholds, #6    ;// 2*4 consumed -> net +2
        M_STR   pThresholds, ppThresholds

        M_LDRD  alpha, beta, pAlphaBeta0        ;// first-edge alpha/beta

        BNE     LoopY
        MOV     r0, #OMX_Sts_NoErr
;//-----------------End Filter--------------------

        M_END

        ENDIF

        END