;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; ARMv6 loop filter for VP8.  ARM RVCT (armasm) syntax; requires the ARMv6
; media extensions (uqsub8/uqadd8/sel/shadd8/ssat/pkhbt/...), which operate
; on four unsigned bytes packed in one 32-bit register ("4-in-parallel").

    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_vertical_edge_armv6|
    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code

    ; Transpose a 4x4 block of bytes held in four word registers.
    ; Used by the vertical-edge filters so that one pixel column becomes
    ; one packed word and the 4-in-parallel byte ops can be reused.
    ; input:  $a0, $a1, $a2, $a3 (four rows); output: $b0, $b1, $b2, $b3
    ; (four columns).  NOTE: the $aN registers are clobbered.
    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND

; Register aliases used by every routine in this file.
src         RN  r0                          ; current pixel pointer
pstep       RN  r1                          ; stride between rows (src_pixel_step)
count       RN  r5                          ; remaining 4-pixel groups

;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *flimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int count

;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
63 64;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 65|vp8_loop_filter_horizontal_edge_armv6| PROC 66;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 67 stmdb sp!, {r4 - r11, lr} 68 69 sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines 70 ldr count, [sp, #40] ; count for 8-in-parallel 71 ldr r6, [sp, #36] ; load thresh address 72 sub sp, sp, #16 ; create temp buffer 73 74 ldr r9, [src], pstep ; p3 75 ldr r4, [r2], #4 ; flimit 76 ldr r10, [src], pstep ; p2 77 ldr r2, [r3], #4 ; limit 78 ldr r11, [src], pstep ; p1 79 uadd8 r4, r4, r4 ; flimit * 2 80 ldr r3, [r6], #4 ; thresh 81 mov count, count, lsl #1 ; 4-in-parallel 82 uadd8 r4, r4, r2 ; flimit * 2 + limit 83 84|Hnext8| 85 ; vp8_filter_mask() function 86 ; calculate breakout conditions 87 ldr r12, [src], pstep ; p0 88 89 uqsub8 r6, r9, r10 ; p3 - p2 90 uqsub8 r7, r10, r9 ; p2 - p3 91 uqsub8 r8, r10, r11 ; p2 - p1 92 uqsub8 r10, r11, r10 ; p1 - p2 93 94 orr r6, r6, r7 ; abs (p3-p2) 95 orr r8, r8, r10 ; abs (p2-p1) 96 uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp8_filter_mask 97 uqsub8 r8, r8, r2 ; compare to limit 98 uqsub8 r6, r11, r12 ; p1 - p0 99 orr lr, lr, r8 100 uqsub8 r7, r12, r11 ; p0 - p1 101 ldr r9, [src], pstep ; q0 102 ldr r10, [src], pstep ; q1 103 orr r6, r6, r7 ; abs (p1-p0) 104 uqsub8 r7, r6, r2 ; compare to limit 105 uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later 106 orr lr, lr, r7 107 108 uqsub8 r6, r11, r10 ; p1 - q1 109 uqsub8 r7, r10, r11 ; q1 - p1 110 uqsub8 r11, r12, r9 ; p0 - q0 111 uqsub8 r12, r9, r12 ; q0 - p0 112 orr r6, r6, r7 ; abs (p1-q1) 113 ldr r7, c0x7F7F7F7F 114 orr r12, r11, r12 ; abs (p0-q0) 115 ldr r11, [src], pstep ; q2 116 uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 117 and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 118 uqsub8 r7, r9, r10 ; q0 - q1 119 uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 120 uqsub8 r6, r10, r9 ; q1 - q0 121 uqsub8 r12, r12, r4 ; compare to flimit 122 uqsub8 r9, r11, r10 ; q2 - q1 123 124 orr lr, lr, r12 125 126 ldr r12, [src], pstep ; q3 127 uqsub8 r10, r10, r11 ; q1 - q2 128 orr r6, r7, r6 ; abs (q1-q0) 129 orr r10, r9, r10 ; abs (q2-q1) 130 uqsub8 r7, r6, r2 ; compare to limit 131 uqsub8 r10, r10, r2 ; compare to limit 132 uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later 133 orr lr, lr, r7 134 orr lr, lr, r10 135 136 uqsub8 r10, r12, r11 ; q3 - q2 137 uqsub8 r9, r11, r12 ; q2 - q3 138 139 mvn r11, #0 ; r11 == -1 140 141 orr r10, r10, r9 ; abs (q3-q2) 142 uqsub8 r10, r10, r2 ; compare to limit 143 144 mov r12, #0 145 orr lr, lr, r10 146 sub src, src, pstep, lsl #2 147 148 usub8 lr, r12, lr ; use usub8 instead of ssub8 149 sel lr, r11, r12 ; filter mask: lr 150 151 cmp lr, #0 152 beq hskip_filter ; skip filtering 153 154 sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines 155 156 ;vp8_hevmask() function 157 ;calculate high edge variance 158 orr r10, r6, r8 ; calculate vp8_hevmask 159 160 ldr r7, [src], pstep ; p1 161 162 usub8 r10, r12, r10 ; use usub8 instead of ssub8 163 sel r6, r12, r11 ; obtain vp8_hevmask: r6 164 165 
;vp8_filter() function 166 ldr r8, [src], pstep ; p0 167 ldr r12, c0x80808080 168 ldr r9, [src], pstep ; q0 169 ldr r10, [src], pstep ; q1 170 171 eor r7, r7, r12 ; p1 offset to convert to a signed value 172 eor r8, r8, r12 ; p0 offset to convert to a signed value 173 eor r9, r9, r12 ; q0 offset to convert to a signed value 174 eor r10, r10, r12 ; q1 offset to convert to a signed value 175 176 str r9, [sp] ; store qs0 temporarily 177 str r8, [sp, #4] ; store ps0 temporarily 178 str r10, [sp, #8] ; store qs1 temporarily 179 str r7, [sp, #12] ; store ps1 temporarily 180 181 qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) 182 qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) 183 184 and r7, r7, r6 ; vp8_filter (r7) &= hev 185 186 qadd8 r7, r7, r8 187 ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 188 189 qadd8 r7, r7, r8 190 ldr r10, c0x04040404 191 192 qadd8 r7, r7, r8 193 and r7, r7, lr ; vp8_filter &= mask; 194 195 ;modify code for vp8 -- Filter1 = vp8_filter (r7) 196 qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) 197 qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) 198 199 mov r9, #0 200 shadd8 r8 , r8 , r9 ; Filter2 >>= 3 201 shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 202 shadd8 r8 , r8 , r9 203 shadd8 r7 , r7 , r9 204 shadd8 lr , r8 , r9 ; lr: Filter2 205 shadd8 r7 , r7 , r9 ; r7: filter 206 207 ;usub8 lr, r8, r10 ; s = (s==4)*-1 208 ;sel lr, r11, r9 209 ;usub8 r8, r10, r8 210 ;sel r8, r11, r9 211 ;and r8, r8, lr ; -1 for each element that equals 4 212 213 ;calculate output 214 ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) 215 216 ldr r8, [sp] ; load qs0 217 ldr r9, [sp, #4] ; load ps0 218 219 ldr r10, c0x01010101 220 221 qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) 222 qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) 223 224 ;end of modification for vp8 225 226 mov lr, #0 227 sadd8 r7, r7 , r10 ; vp8_filter += 1 228 shadd8 r7, r7, lr ; vp8_filter 
>>= 1 229 230 ldr r11, [sp, #12] ; load ps1 231 ldr r10, [sp, #8] ; load qs1 232 233 bic r7, r7, r6 ; vp8_filter &= ~hev 234 sub src, src, pstep, lsl #2 235 236 qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) 237 qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) 238 239 eor r11, r11, r12 ; *op1 = u^0x80 240 str r11, [src], pstep ; store op1 241 eor r9, r9, r12 ; *op0 = u^0x80 242 str r9, [src], pstep ; store op0 result 243 eor r8, r8, r12 ; *oq0 = u^0x80 244 str r8, [src], pstep ; store oq0 result 245 eor r10, r10, r12 ; *oq1 = u^0x80 246 str r10, [src], pstep ; store oq1 247 248 sub src, src, pstep, lsl #1 249 250|hskip_filter| 251 add src, src, #4 252 sub src, src, pstep, lsl #2 253 254 subs count, count, #1 255 256 ;pld [src] 257 ;pld [src, pstep] 258 ;pld [src, pstep, lsl #1] 259 ;pld [src, pstep, lsl #2] 260 ;pld [src, pstep, lsl #3] 261 262 ldrne r9, [src], pstep ; p3 263 ldrne r10, [src], pstep ; p2 264 ldrne r11, [src], pstep ; p1 265 266 bne Hnext8 267 268 add sp, sp, #16 269 ldmia sp!, {r4 - r11, pc} 270 ENDP ; |vp8_loop_filter_horizontal_edge_armv6| 271 272 273;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 274|vp8_mbloop_filter_horizontal_edge_armv6| PROC 275;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 276 stmdb sp!, {r4 - r11, lr} 277 278 sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines 279 ldr count, [sp, #40] ; count for 8-in-parallel 280 ldr r6, [sp, #36] ; load thresh address 281 sub sp, sp, #16 ; create temp buffer 282 283 ldr r9, [src], pstep ; p3 284 ldr r4, [r2], #4 ; flimit 285 ldr r10, [src], pstep ; p2 286 ldr r2, [r3], #4 ; limit 287 ldr r11, [src], pstep ; p1 288 uadd8 r4, r4, r4 ; flimit * 2 289 ldr r3, [r6], #4 ; thresh 290 mov count, count, lsl #1 ; 4-in-parallel 291 uadd8 r4, r4, r2 ; flimit * 2 + limit 292 293|MBHnext8| 294 295 ; vp8_filter_mask() function 296 ; calculate breakout conditions 297 ldr r12, [src], pstep ; p0 298 
299 uqsub8 r6, r9, r10 ; p3 - p2 300 uqsub8 r7, r10, r9 ; p2 - p3 301 uqsub8 r8, r10, r11 ; p2 - p1 302 uqsub8 r10, r11, r10 ; p1 - p2 303 304 orr r6, r6, r7 ; abs (p3-p2) 305 orr r8, r8, r10 ; abs (p2-p1) 306 uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask 307 uqsub8 r8, r8, r2 ; compare to limit 308 309 uqsub8 r6, r11, r12 ; p1 - p0 310 orr lr, lr, r8 311 uqsub8 r7, r12, r11 ; p0 - p1 312 ldr r9, [src], pstep ; q0 313 ldr r10, [src], pstep ; q1 314 orr r6, r6, r7 ; abs (p1-p0) 315 uqsub8 r7, r6, r2 ; compare to limit 316 uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later 317 orr lr, lr, r7 318 319 uqsub8 r6, r11, r10 ; p1 - q1 320 uqsub8 r7, r10, r11 ; q1 - p1 321 uqsub8 r11, r12, r9 ; p0 - q0 322 uqsub8 r12, r9, r12 ; q0 - p0 323 orr r6, r6, r7 ; abs (p1-q1) 324 ldr r7, c0x7F7F7F7F 325 orr r12, r11, r12 ; abs (p0-q0) 326 ldr r11, [src], pstep ; q2 327 uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 328 and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 329 uqsub8 r7, r9, r10 ; q0 - q1 330 uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 331 uqsub8 r6, r10, r9 ; q1 - q0 332 uqsub8 r12, r12, r4 ; compare to flimit 333 uqsub8 r9, r11, r10 ; q2 - q1 334 335 orr lr, lr, r12 336 337 ldr r12, [src], pstep ; q3 338 339 uqsub8 r10, r10, r11 ; q1 - q2 340 orr r6, r7, r6 ; abs (q1-q0) 341 orr r10, r9, r10 ; abs (q2-q1) 342 uqsub8 r7, r6, r2 ; compare to limit 343 uqsub8 r10, r10, r2 ; compare to limit 344 uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later 345 orr lr, lr, r7 346 orr lr, lr, r10 347 348 uqsub8 r10, r12, r11 ; q3 - q2 349 uqsub8 r9, r11, r12 ; q2 - q3 350 351 mvn r11, #0 ; r11 == -1 352 353 orr r10, r10, r9 ; abs (q3-q2) 354 uqsub8 r10, r10, r2 ; compare to limit 355 356 mov r12, #0 357 358 orr lr, lr, r10 359 360 usub8 lr, r12, lr ; use usub8 instead of ssub8 361 sel lr, r11, r12 ; filter mask: lr 362 363 cmp lr, #0 364 beq mbhskip_filter ; skip filtering 365 366 ;vp8_hevmask() function 367 ;calculate high edge variance 368 sub src, src, pstep, lsl 
#2 ; move src pointer down by 6 lines 369 sub src, src, pstep, lsl #1 370 371 orr r10, r6, r8 372 ldr r7, [src], pstep ; p1 373 374 usub8 r10, r12, r10 375 sel r6, r12, r11 ; hev mask: r6 376 377 ;vp8_mbfilter() function 378 ;p2, q2 are only needed at the end. Don't need to load them in now. 379 ldr r8, [src], pstep ; p0 380 ldr r12, c0x80808080 381 ldr r9, [src], pstep ; q0 382 ldr r10, [src] ; q1 383 384 eor r7, r7, r12 ; ps1 385 eor r8, r8, r12 ; ps0 386 eor r9, r9, r12 ; qs0 387 eor r10, r10, r12 ; qs1 388 389 qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) 390 str r7, [sp, #12] ; store ps1 temporarily 391 qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) 392 str r10, [sp, #8] ; store qs1 temporarily 393 qadd8 r7, r7, r12 394 str r9, [sp] ; store qs0 temporarily 395 qadd8 r7, r7, r12 396 str r8, [sp, #4] ; store ps0 temporarily 397 qadd8 r7, r7, r12 ; vp8_filter: r7 398 399 ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 400 ldr r9, c0x04040404 401 402 and r7, r7, lr ; vp8_filter &= mask (lr is free) 403 404 mov r12, r7 ; Filter2: r12 405 and r12, r12, r6 ; Filter2 &= hev 406 407 ;modify code for vp8 408 ;save bottom 3 bits so that we round one side +4 and the other +3 409 qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) 410 qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) 411 412 mov r10, #0 413 shadd8 r8 , r8 , r10 ; Filter1 >>= 3 414 shadd8 r12 , r12 , r10 ; Filter2 >>= 3 415 shadd8 r8 , r8 , r10 416 shadd8 r12 , r12 , r10 417 shadd8 r8 , r8 , r10 ; r8: Filter1 418 shadd8 r12 , r12 , r10 ; r12: Filter2 419 420 ldr r9, [sp] ; load qs0 421 ldr r11, [sp, #4] ; load ps0 422 423 qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) 424 qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) 425 426 ;save bottom 3 bits so that we round one side +4 and the other +3 427 ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) 428 ;qadd8 r12 , r12 , r9 ; Filter2 = 
vp8_signed_char_clamp(Filter2+4) 429 ;mov r10, #0 430 ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 431 ;usub8 lr, r8, r9 ; s = (s==4)*-1 432 ;sel lr, r11, r10 433 ;shadd8 r12 , r12 , r10 434 ;usub8 r8, r9, r8 435 ;sel r8, r11, r10 436 ;ldr r9, [sp] ; load qs0 437 ;ldr r11, [sp, #4] ; load ps0 438 ;shadd8 r12 , r12 , r10 439 ;and r8, r8, lr ; -1 for each element that equals 4 440 ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) 441 ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) 442 ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) 443 444 ;end of modification for vp8 445 446 bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) 447 ;mov r12, r7 448 449 ;roughly 3/7th difference across boundary 450 mov lr, #0x1b ; 27 451 mov r7, #0x3f ; 63 452 453 sxtb16 r6, r12 454 sxtb16 r10, r12, ror #8 455 smlabb r8, r6, lr, r7 456 smlatb r6, r6, lr, r7 457 smlabb r7, r10, lr, r7 458 smultb r10, r10, lr 459 ssat r8, #8, r8, asr #7 460 ssat r6, #8, r6, asr #7 461 add r10, r10, #63 462 ssat r7, #8, r7, asr #7 463 ssat r10, #8, r10, asr #7 464 465 ldr lr, c0x80808080 466 467 pkhbt r6, r8, r6, lsl #16 468 pkhbt r10, r7, r10, lsl #16 469 uxtb16 r6, r6 470 uxtb16 r10, r10 471 472 sub src, src, pstep 473 474 orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) 475 476 qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) 477 qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) 478 eor r8, r8, lr ; *oq0 = s^0x80 479 str r8, [src] ; store *oq0 480 sub src, src, pstep 481 eor r10, r10, lr ; *op0 = s^0x80 482 str r10, [src] ; store *op0 483 484 ;roughly 2/7th difference across boundary 485 mov lr, #0x12 ; 18 486 mov r7, #0x3f ; 63 487 488 sxtb16 r6, r12 489 sxtb16 r10, r12, ror #8 490 smlabb r8, r6, lr, r7 491 smlatb r6, r6, lr, r7 492 smlabb r9, r10, lr, r7 493 smlatb r10, r10, lr, r7 494 ssat r8, #8, r8, asr #7 495 ssat r6, #8, r6, asr #7 496 ssat r9, #8, r9, asr #7 497 ssat r10, #8, r10, asr #7 498 499 ldr lr, c0x80808080 500 
501 pkhbt r6, r8, r6, lsl #16 502 pkhbt r10, r9, r10, lsl #16 503 504 ldr r9, [sp, #8] ; load qs1 505 ldr r11, [sp, #12] ; load ps1 506 507 uxtb16 r6, r6 508 uxtb16 r10, r10 509 510 sub src, src, pstep 511 512 orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) 513 514 qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) 515 qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) 516 eor r11, r11, lr ; *op1 = s^0x80 517 str r11, [src], pstep ; store *op1 518 eor r8, r8, lr ; *oq1 = s^0x80 519 add src, src, pstep, lsl #1 520 521 mov r7, #0x3f ; 63 522 523 str r8, [src], pstep ; store *oq1 524 525 ;roughly 1/7th difference across boundary 526 mov lr, #0x9 ; 9 527 ldr r9, [src] ; load q2 528 529 sxtb16 r6, r12 530 sxtb16 r10, r12, ror #8 531 smlabb r8, r6, lr, r7 532 smlatb r6, r6, lr, r7 533 smlabb r12, r10, lr, r7 534 smlatb r10, r10, lr, r7 535 ssat r8, #8, r8, asr #7 536 ssat r6, #8, r6, asr #7 537 ssat r12, #8, r12, asr #7 538 ssat r10, #8, r10, asr #7 539 540 sub src, src, pstep, lsl #2 541 542 pkhbt r6, r8, r6, lsl #16 543 pkhbt r10, r12, r10, lsl #16 544 545 sub src, src, pstep 546 ldr lr, c0x80808080 547 548 ldr r11, [src] ; load p2 549 550 uxtb16 r6, r6 551 uxtb16 r10, r10 552 553 eor r9, r9, lr 554 eor r11, r11, lr 555 556 orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) 557 558 qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) 559 qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) 560 eor r8, r8, lr ; *op2 = s^0x80 561 str r8, [src], pstep, lsl #2 ; store *op2 562 add src, src, pstep 563 eor r10, r10, lr ; *oq2 = s^0x80 564 str r10, [src], pstep, lsl #1 ; store *oq2 565 566|mbhskip_filter| 567 add src, src, #4 568 sub src, src, pstep, lsl #3 569 subs count, count, #1 570 571 ldrne r9, [src], pstep ; p3 572 ldrne r10, [src], pstep ; p2 573 ldrne r11, [src], pstep ; p1 574 575 bne MBHnext8 576 577 add sp, sp, #16 578 ldmia sp!, {r4 - r11, pc} 579 ENDP ; 

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
; void vp8_loop_filter_vertical_edge_armv6(unsigned char *src_ptr,
;           int src_pixel_step, const char *flimit, const char *limit,
;           const char *thresh, int count)
; Normal (inner) loop filter across a vertical edge.  Pixels p3..q3 lie
; along each row, so 4x4 blocks are transposed (TRANSPOSE_MATRIX) to reuse
; the same packed 4-in-parallel byte arithmetic as the horizontal filter,
; and results are transposed back before the strh stores.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4 (to p3)
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldr         r4, [r2], #4                ; flimit
    ldr         r7, [src], pstep
    ldr         r2, [r3], #4                ; limit
    ldr         r8, [src], pstep
    uadd8       r4, r4, r4                  ; flimit * 2
    ldr         r3, [r12], #4               ; thresh
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    uadd8       r4, r4, r2                  ; flimit * 2 + limit

|Vnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4 (to q0)
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to flimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         vskip_filter                ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_filter() function
    ; load soure data to r6, r11, r12, lr (p1/p0 and q0/q1 halfword pairs)
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
    str         r6, [sp]
    str         lr, [sp, #4]

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev (r7 : filter)

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    ;mvn            r11, #0                 ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    ; three shadd8-by-zero steps == signed per-byte >> 3
    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
    shadd8      r8 , r8 , r9
    shadd8      r7 , r7 , r9
    shadd8      lr , r8 , r9                ; lr: filter2
    shadd8      r7 , r7 , r9                ; r7: filter

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel            lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel            r8, r11, r9
    ;and            r8, r8, lr              ; -1 for each element that equals 4 -- r8: s

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
    ;end of modification for vp8

    eor         r8, r8, r12                 ; convert back to unsigned
    eor         r9, r9, r12

    mov         lr, #0

    sadd8       r7, r7, r10                 ; vp8_filter += 1
    shadd8      r7, r7, lr                  ; vp8_filter >>= 1

    ldr         r10, [sp, #8]               ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    bic         r7, r7, r6                  ; r7: vp8_filter (&= ~hev)

    qsub8       r10 , r10, r7               ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    eor         r10, r10, r12
    eor         r11, r11, r12

    sub         src, src, pstep, lsl #2

    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
    ;output is b0, b1, b2, b3
    ;b0: 03 02 01 00
    ;b1: 13 12 11 10
    ;b2: 23 22 21 20
    ;b3: 33 32 31 30
    ;  p1 p0 q0 q1
    ;   (a3 a2 a1 a0)
    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr

    ; write each row back as two halfword stores straddling the edge
    strh        r6, [src, #-2]              ; store the result
    mov         r6, r6, lsr #16
    strh        r6, [src], pstep

    strh        r7, [src, #-2]
    mov         r7, r7, lsr #16
    strh        r7, [src], pstep

    strh        r12, [src, #-2]
    mov         r12, r12, lsr #16
    strh        r12, [src], pstep

    strh        lr, [src, #-2]
    mov         lr, lr, lsr #16
    strh        lr, [src], pstep

|vskip_filter|
    sub         src, src, #4                ; back to p3 of the next 4 rows
    subs        count, count, #1

    ldrne       r6, [src], pstep            ; load source data
    ldrne       r7, [src], pstep
    ldrne       r8, [src], pstep
    ldrne       lr, [src], pstep

    bne         Vnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|

852;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 853|vp8_mbloop_filter_vertical_edge_armv6| PROC 854;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 855 stmdb sp!, {r4 - r11, lr} 856 857 sub src, src, #4 ; move src pointer down by 4 858 ldr count, [sp, #40] ; count for 8-in-parallel 859 ldr r12, [sp, #36] ; load thresh address 860 sub sp, sp, #16 ; create temp buffer 861 862 ldr r6, [src], pstep ; load source data 863 ldr r4, [r2], #4 ; flimit 864 ldr r7, [src], pstep 865 ldr r2, [r3], #4 ; limit 866 ldr r8, [src], pstep 867 uadd8 r4, r4, r4 ; flimit * 2 868 ldr r3, [r12], #4 ; thresh 869 ldr lr, [src], pstep 870 mov count, count, lsl #1 ; 4-in-parallel 871 uadd8 r4, r4, r2 ; flimit * 2 + limit 872 873|MBVnext8| 874 ; vp8_filter_mask() function 875 ; calculate breakout conditions 876 ; transpose the source data for 4-in-parallel operation 877 TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 878 879 uqsub8 r7, r9, r10 ; p3 - p2 880 uqsub8 r8, r10, r9 ; p2 - p3 881 uqsub8 r9, r10, r11 ; p2 - p1 882 uqsub8 r10, r11, r10 ; p1 - p2 883 orr r7, r7, r8 ; abs (p3-p2) 884 orr r10, r9, r10 ; abs (p2-p1) 885 uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask 886 uqsub8 r10, r10, r2 ; compare to limit 887 888 sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines 889 890 orr lr, lr, r10 891 892 uqsub8 r6, r11, r12 ; p1 - p0 893 uqsub8 r7, r12, r11 ; p0 - p1 894 add src, src, #4 ; move src pointer up by 4 895 orr r6, r6, r7 ; abs (p1-p0) 896 str r11, [sp, #12] ; save p1 897 uqsub8 r10, r6, r2 ; compare to limit 898 uqsub8 r11, r6, r3 ; compare to thresh 899 orr lr, lr, r10 900 901 ; transpose uses 8 regs(r6 - r12 and lr). 
Need to save reg value now 902 ; transpose the source data for 4-in-parallel operation 903 ldr r6, [src], pstep ; load source data 904 str r11, [sp] ; push r11 to stack 905 ldr r7, [src], pstep 906 str r12, [sp, #4] ; save current reg before load q0 - q3 data 907 ldr r8, [src], pstep 908 str lr, [sp, #8] 909 ldr lr, [src], pstep 910 911 TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 912 913 ldr lr, [sp, #8] ; load back (f)limit accumulator 914 915 uqsub8 r6, r12, r11 ; q3 - q2 916 uqsub8 r7, r11, r12 ; q2 - q3 917 uqsub8 r12, r11, r10 ; q2 - q1 918 uqsub8 r11, r10, r11 ; q1 - q2 919 orr r6, r6, r7 ; abs (q3-q2) 920 orr r7, r12, r11 ; abs (q2-q1) 921 uqsub8 r6, r6, r2 ; compare to limit 922 uqsub8 r7, r7, r2 ; compare to limit 923 ldr r11, [sp, #4] ; load back p0 924 ldr r12, [sp, #12] ; load back p1 925 orr lr, lr, r6 926 orr lr, lr, r7 927 928 uqsub8 r6, r11, r9 ; p0 - q0 929 uqsub8 r7, r9, r11 ; q0 - p0 930 uqsub8 r8, r12, r10 ; p1 - q1 931 uqsub8 r11, r10, r12 ; q1 - p1 932 orr r6, r6, r7 ; abs (p0-q0) 933 ldr r7, c0x7F7F7F7F 934 orr r8, r8, r11 ; abs (p1-q1) 935 uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 936 and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 937 uqsub8 r11, r10, r9 ; q1 - q0 938 uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 939 uqsub8 r12, r9, r10 ; q0 - q1 940 uqsub8 r6, r6, r4 ; compare to flimit 941 942 orr r9, r11, r12 ; abs (q1-q0) 943 uqsub8 r8, r9, r2 ; compare to limit 944 uqsub8 r10, r9, r3 ; compare to thresh 945 orr lr, lr, r6 946 orr lr, lr, r8 947 948 mvn r11, #0 ; r11 == -1 949 mov r12, #0 950 951 usub8 lr, r12, lr 952 ldr r9, [sp] ; load the compared result 953 sel lr, r11, r12 ; filter mask: lr 954 955 cmp lr, #0 956 beq mbvskip_filter ; skip filtering 957 958 959 ;vp8_hevmask() function 960 ;calculate high edge variance 961 962 sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines 963 964 orr r9, r9, r10 965 966 ldrh r7, [src, #-2] 967 ldrh r8, [src], pstep 968 969 usub8 r9, r12, r9 970 sel r6, r12, r11 ; hev mask: r6 971 972 
973 ; vp8_mbfilter() function 974 ; p2, q2 are only needed at the end. Don't need to load them in now. 975 ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first 976 ; load soure data to r6, r11, r12, lr 977 ldrh r9, [src, #-2] 978 ldrh r10, [src], pstep 979 980 pkhbt r12, r7, r8, lsl #16 981 982 ldrh r7, [src, #-2] 983 ldrh r8, [src], pstep 984 985 pkhbt r11, r9, r10, lsl #16 986 987 ldrh r9, [src, #-2] 988 ldrh r10, [src], pstep 989 990 str r6, [sp] ; save r6 991 str lr, [sp, #4] ; save lr 992 993 pkhbt r6, r7, r8, lsl #16 994 pkhbt lr, r9, r10, lsl #16 995 996 ;transpose r12, r11, r6, lr to p1, p0, q0, q1 997 TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 998 999 ;load back hev_mask r6 and filter_mask lr 1000 ldr r12, c0x80808080 1001 ldr r6, [sp] 1002 ldr lr, [sp, #4] 1003 1004 eor r7, r7, r12 ; ps1 1005 eor r8, r8, r12 ; ps0 1006 eor r9, r9, r12 ; qs0 1007 eor r10, r10, r12 ; qs1 1008 1009 qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) 1010 str r7, [sp, #12] ; store ps1 temporarily 1011 qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) 1012 str r10, [sp, #8] ; store qs1 temporarily 1013 qadd8 r7, r7, r12 1014 str r9, [sp] ; store qs0 temporarily 1015 qadd8 r7, r7, r12 1016 str r8, [sp, #4] ; store ps0 temporarily 1017 qadd8 r7, r7, r12 ; vp8_filter: r7 1018 1019 ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 1020 ldr r9, c0x04040404 1021 ;mvn r11, #0 ; r11 == -1 1022 1023 and r7, r7, lr ; vp8_filter &= mask (lr is free) 1024 1025 mov r12, r7 ; Filter2: r12 1026 and r12, r12, r6 ; Filter2 &= hev 1027 1028 ;modify code for vp8 1029 ;save bottom 3 bits so that we round one side +4 and the other +3 1030 qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) 1031 qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) 1032 1033 mov r10, #0 1034 shadd8 r8 , r8 , r10 ; Filter1 >>= 3 1035 shadd8 r12 , r12 , r10 ; Filter2 >>= 3 1036 shadd8 r8 , r8 , r10 1037 shadd8 r12 , r12 , r10 1038 shadd8 
r8 , r8 , r10 ; r8: Filter1 1039 shadd8 r12 , r12 , r10 ; r12: Filter2 1040 1041 ldr r9, [sp] ; load qs0 1042 ldr r11, [sp, #4] ; load ps0 1043 1044 qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) 1045 qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) 1046 1047 ;save bottom 3 bits so that we round one side +4 and the other +3 1048 ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) 1049 ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) 1050 ;mov r10, #0 1051 ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 1052 ;usub8 lr, r8, r9 ; s = (s==4)*-1 1053 ;sel lr, r11, r10 1054 ;shadd8 r12 , r12 , r10 1055 ;usub8 r8, r9, r8 1056 ;sel r8, r11, r10 1057 ;ldr r9, [sp] ; load qs0 1058 ;ldr r11, [sp, #4] ; load ps0 1059 ;shadd8 r12 , r12 , r10 1060 ;and r8, r8, lr ; -1 for each element that equals 4 1061 ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) 1062 ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) 1063 ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) 1064 1065 ;end of modification for vp8 1066 1067 bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) 1068 ;mov r12, r7 1069 1070 ;roughly 3/7th difference across boundary 1071 mov lr, #0x1b ; 27 1072 mov r7, #0x3f ; 63 1073 1074 sxtb16 r6, r12 1075 sxtb16 r10, r12, ror #8 1076 smlabb r8, r6, lr, r7 1077 smlatb r6, r6, lr, r7 1078 smlabb r7, r10, lr, r7 1079 smultb r10, r10, lr 1080 ssat r8, #8, r8, asr #7 1081 ssat r6, #8, r6, asr #7 1082 add r10, r10, #63 1083 ssat r7, #8, r7, asr #7 1084 ssat r10, #8, r10, asr #7 1085 1086 ldr lr, c0x80808080 1087 1088 pkhbt r6, r8, r6, lsl #16 1089 pkhbt r10, r7, r10, lsl #16 1090 uxtb16 r6, r6 1091 uxtb16 r10, r10 1092 1093 sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines 1094 1095 orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) 1096 1097 qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) 1098 qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) 1099 eor r8, r8, lr ; 
*oq0 = s^0x80 1100 eor r10, r10, lr ; *op0 = s^0x80 1101 1102 strb r10, [src, #-1] ; store op0 result 1103 strb r8, [src], pstep ; store oq0 result 1104 mov r10, r10, lsr #8 1105 mov r8, r8, lsr #8 1106 strb r10, [src, #-1] 1107 strb r8, [src], pstep 1108 mov r10, r10, lsr #8 1109 mov r8, r8, lsr #8 1110 strb r10, [src, #-1] 1111 strb r8, [src], pstep 1112 mov r10, r10, lsr #8 1113 mov r8, r8, lsr #8 1114 strb r10, [src, #-1] 1115 strb r8, [src], pstep 1116 1117 ;roughly 2/7th difference across boundary 1118 mov lr, #0x12 ; 18 1119 mov r7, #0x3f ; 63 1120 1121 sxtb16 r6, r12 1122 sxtb16 r10, r12, ror #8 1123 smlabb r8, r6, lr, r7 1124 smlatb r6, r6, lr, r7 1125 smlabb r9, r10, lr, r7 1126 smlatb r10, r10, lr, r7 1127 ssat r8, #8, r8, asr #7 1128 ssat r6, #8, r6, asr #7 1129 ssat r9, #8, r9, asr #7 1130 ssat r10, #8, r10, asr #7 1131 1132 sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines 1133 1134 pkhbt r6, r8, r6, lsl #16 1135 pkhbt r10, r9, r10, lsl #16 1136 1137 ldr r9, [sp, #8] ; load qs1 1138 ldr r11, [sp, #12] ; load ps1 1139 ldr lr, c0x80808080 1140 1141 uxtb16 r6, r6 1142 uxtb16 r10, r10 1143 1144 add src, src, #2 1145 1146 orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) 1147 1148 qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) 1149 qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) 1150 eor r8, r8, lr ; *oq1 = s^0x80 1151 eor r10, r10, lr ; *op1 = s^0x80 1152 1153 ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary 1154 strb r10, [src, #-4] ; store op1 1155 strb r8, [src, #-1] ; store oq1 1156 ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary 1157 1158 mov r10, r10, lsr #8 1159 mov r8, r8, lsr #8 1160 1161 ldrb r6, [src, #-5] 1162 strb r10, [src, #-4] 1163 strb r8, [src, #-1] 1164 ldrb r7, [src], pstep 1165 1166 mov r10, r10, lsr #8 1167 mov r8, r8, lsr #8 1168 orr r11, r11, r6, lsl #8 1169 orr r9, r9, r7, lsl #8 1170 1171 ldrb r6, [src, #-5] 1172 strb r10, [src, #-4] 
1173 strb r8, [src, #-1] 1174 ldrb r7, [src], pstep 1175 1176 mov r10, r10, lsr #8 1177 mov r8, r8, lsr #8 1178 orr r11, r11, r6, lsl #16 1179 orr r9, r9, r7, lsl #16 1180 1181 ldrb r6, [src, #-5] 1182 strb r10, [src, #-4] 1183 strb r8, [src, #-1] 1184 ldrb r7, [src], pstep 1185 orr r11, r11, r6, lsl #24 1186 orr r9, r9, r7, lsl #24 1187 1188 ;roughly 1/7th difference across boundary 1189 eor r9, r9, lr 1190 eor r11, r11, lr 1191 1192 mov lr, #0x9 ; 9 1193 mov r7, #0x3f ; 63 1194 1195 sxtb16 r6, r12 1196 sxtb16 r10, r12, ror #8 1197 smlabb r8, r6, lr, r7 1198 smlatb r6, r6, lr, r7 1199 smlabb r12, r10, lr, r7 1200 smlatb r10, r10, lr, r7 1201 ssat r8, #8, r8, asr #7 1202 ssat r6, #8, r6, asr #7 1203 ssat r12, #8, r12, asr #7 1204 ssat r10, #8, r10, asr #7 1205 1206 sub src, src, pstep, lsl #2 1207 1208 pkhbt r6, r8, r6, lsl #16 1209 pkhbt r10, r12, r10, lsl #16 1210 1211 uxtb16 r6, r6 1212 uxtb16 r10, r10 1213 1214 ldr lr, c0x80808080 1215 1216 orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) 1217 1218 qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) 1219 qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) 1220 eor r8, r8, lr ; *op2 = s^0x80 1221 eor r10, r10, lr ; *oq2 = s^0x80 1222 1223 strb r8, [src, #-5] ; store *op2 1224 strb r10, [src], pstep ; store *oq2 1225 mov r8, r8, lsr #8 1226 mov r10, r10, lsr #8 1227 strb r8, [src, #-5] 1228 strb r10, [src], pstep 1229 mov r8, r8, lsr #8 1230 mov r10, r10, lsr #8 1231 strb r8, [src, #-5] 1232 strb r10, [src], pstep 1233 mov r8, r8, lsr #8 1234 mov r10, r10, lsr #8 1235 strb r8, [src, #-5] 1236 strb r10, [src], pstep 1237 1238 ;adjust src pointer for next loop 1239 sub src, src, #2 1240 1241|mbvskip_filter| 1242 sub src, src, #4 1243 subs count, count, #1 1244 1245 ldrne r6, [src], pstep ; load source data 1246 ldrne r7, [src], pstep 1247 ldrne r8, [src], pstep 1248 ldrne lr, [src], pstep 1249 1250 bne MBVnext8 1251 1252 add sp, sp, #16 1253 1254 ldmia sp!, {r4 - r11, pc} 1255 
ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| 1256 1257; Constant Pool 1258c0x80808080 DCD 0x80808080 1259c0x03030303 DCD 0x03030303 1260c0x04040404 DCD 0x04040404 1261c0x01010101 DCD 0x01010101 1262c0x7F7F7F7F DCD 0x7F7F7F7F 1263 1264 END 1265