; loopfilter_v6.asm revision 1b362b15af34006e6a11974088a46d42b903418e
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_vertical_edge_armv6|
    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code

    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND
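
; A minimal C sketch of what TRANSPOSE_MATRIX computes (illustrative only,
; not part of the original source): it transposes a 4x4 block of bytes held
; in four 32-bit words, so that four pixels from the same column end up
; packed into one register for the 4-in-parallel arithmetic below.
;
;   /* a[i] holds row i (byte j = pixel (i,j)); b[j] receives column j */
;   static void transpose_4x4(const unsigned int a[4], unsigned int b[4])
;   {
;       int i, j;
;       for (j = 0; j < 4; j++) {
;           b[j] = 0;
;           for (i = 0; i < 4; i++)
;               b[j] |= ((a[i] >> (8 * j)) & 0xff) << (8 * i);
;       }
;   }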


src         RN  r0
pstep       RN  r1
count       RN  r5

;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *blimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int  count

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16
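
    ; blimit, limit and thresh are single-byte values; the orr/lsl pairs
    ; above broadcast each byte into all four lanes of a word so that the
    ; uqsub8 comparisons below test four pixels at once. The same idea in
    ; C (illustrative sketch only):
    ;
    ;   unsigned int broadcast_byte(unsigned char v)
    ;   {
    ;       unsigned int x = v;
    ;       x |= x << 8;            /* 00 00 vv vv */
    ;       x |= x << 16;           /* vv vv vv vv */
    ;       return x;
    ;   }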

|Hnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit
    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3
    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0
    orr         lr, lr, r10
    sub         src, src, pstep, lsl #2

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr
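
    ; usub8 with zero sets the per-byte GE flags exactly where a byte of lr
    ; is zero (no borrow in 0 - lr), and sel then picks 0xFF (from r11) for
    ; those bytes and 0x00 elsewhere. Per pixel this is equivalent to the
    ; scalar breakout test (illustrative C, not the original source):
    ;
    ;   mask = (abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
    ;           abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
    ;           abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
    ;           abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? 0xFF : 0;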

    cmp         lr, #0
    beq         hskip_filter                 ; skip filtering

    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines

    ;vp8_hevmask() function
    ;calculate high edge variance
    orr         r10, r6, r8                 ; calculate vp8_hevmask

    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10               ; use usub8 instead of ssub8
    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
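
    ; per pixel, r6 is now equivalent to the scalar test (sketch):
    ;   hev = (abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? 0xFF : 0;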

    ;vp8_filter() function
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    and         r7, r7, lr                  ; vp8_filter &= mask;

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
    shadd8      r8 , r8 , r9
    shadd8      r7 , r7 , r9
    shadd8      lr , r8 , r9                ; lr: Filter2
    shadd8      r7 , r7 , r9                ; r7: filter
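
    ; shadd8 is a signed halving add, so adding zero acts as a per-byte
    ; arithmetic shift right by one; three applications implement the
    ; signed ">> 3" of the filter equations on all four lanes at once.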

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel        lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel        r8, r11, r9
    ;and        r8, r8, lr                  ; -1 for each element that equals 4

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)

    ;end of modification for vp8

    mov         lr, #0
    sadd8       r7, r7, r10                 ; vp8_filter += 1
    shadd8      r7, r7, lr                  ; vp8_filter >>= 1

    ldr         r11, [sp, #12]              ; load ps1
    ldr         r10, [sp, #8]               ; load qs1

    bic         r7, r7, r6                  ; vp8_filter &= ~hev
    sub         src, src, pstep, lsl #2

    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    qsub8       r10, r10, r7                ; u = vp8_signed_char_clamp(qs1 - vp8_filter)

    eor         r11, r11, r12               ; *op1 = u^0x80
    str         r11, [src], pstep           ; store op1
    eor         r9, r9, r12                 ; *op0 = u^0x80
    str         r9, [src], pstep            ; store op0 result
    eor         r8, r8, r12                 ; *oq0 = u^0x80
    str         r8, [src], pstep            ; store oq0 result
    eor         r10, r10, r12               ; *oq1 = u^0x80
    str         r10, [src], pstep           ; store oq1

    sub         src, src, pstep, lsl #1

|hskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #2

    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         Hnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|
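
; For reference, a scalar sketch of the per-pixel filter applied above
; (illustrative C under the naming used in the comments here; clamp8
; stands for vp8_signed_char_clamp):
;
;   int f = clamp8(ps1 - qs1) & hev;
;   f = clamp8(f + 3 * (qs0 - ps0)) & mask;
;   int Filter1 = clamp8(f + 4) >> 3;       /* applied to q0 */
;   int Filter2 = clamp8(f + 3) >> 3;       /* applied to p0 */
;   qs0 = clamp8(qs0 - Filter1);
;   ps0 = clamp8(ps0 + Filter2);
;   f = ((Filter1 + 1) >> 1) & ~hev;        /* outer taps only when !hev */
;   qs1 = clamp8(qs1 - f);
;   ps1 = clamp8(ps1 + f);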


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|MBHnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit

    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3

    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0

    orr         lr, lr, r10

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbhskip_filter               ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance
    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
    sub         src, src, pstep, lsl #1

    orr         r10, r6, r8
    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_mbfilter() function
    ;p2, q2 are only needed at the end. Don't need to load them in now.
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src]                  ; q1

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
    shadd8      r8 , r8 , r10
    shadd8      r12 , r12 , r10
    shadd8      r8 , r8 , r10               ; r8: Filter1
    shadd8      r12 , r12 , r10             ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and        r8, r12, r10                ; s = Filter2 & 7 (s: r8)
    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov        r10, #0
    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
    ;sel        lr, r11, r10
    ;shadd8     r12 , r12 , r10
    ;usub8      r8, r9, r8
    ;sel        r8, r11, r10
    ;ldr        r9, [sp]                    ; load qs0
    ;ldr        r11, [sp, #4]               ; load ps0
    ;shadd8     r12 , r12 , r10
    ;and        r8, r8, lr                  ; -1 for each element that equals 4
    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
    ;mov        r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
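
    ; the sxtb16/smlabb/smlatb/ssat sequence above widens each signed byte
    ; of Filter2 (r12) to 16 bits, evaluates (63 + Filter2 * 27) >> 7 with
    ; full intermediate precision, saturates back to 8 bits, and the
    ; pkhbt/uxtb16/orr steps repack the four lanes into one word (smultb
    ; plus the explicit add of 63 is the same multiply-accumulate split
    ; across two instructions)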

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    str         r8, [src]                   ; store *oq0
    sub         src, src, pstep
    eor         r10, r10, lr                ; *op0 = s^0x80
    str         r10, [src]                  ; store *op0

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    eor         r11, r11, lr                ; *op1 = s^0x80
    str         r11, [src], pstep           ; store *op1
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    add         src, src, pstep, lsl #1

    mov         r7, #0x3f                   ; 63

    str         r8, [src], pstep            ; store *oq1

    ;roughly 1/7th difference across boundary
    mov         lr, #0x9                    ; 9
    ldr         r9, [src]                   ; load q2

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    sub         src, src, pstep
    ldr         lr, c0x80808080

    ldr         r11, [src]                  ; load p2

    uxtb16      r6, r6
    uxtb16      r10, r10

    eor         r9, r9, lr
    eor         r11, r11, lr

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    str         r8, [src], pstep, lsl #2    ; store *op2
    add         src, src, pstep
    eor         r10, r10, lr                ; *oq2 = s^0x80
    str         r10, [src], pstep, lsl #1   ; store *oq2

|mbhskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #3
    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         MBHnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|
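
; A scalar sketch of the macroblock filter above (illustrative C; clamp8
; stands for vp8_signed_char_clamp):
;
;   int f = clamp8(clamp8(ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
;   int Filter2 = f & hev;                  /* inner adjustment */
;   int Filter1 = clamp8(Filter2 + 4) >> 3;
;   Filter2 = clamp8(Filter2 + 3) >> 3;
;   qs0 = clamp8(qs0 - Filter1);
;   ps0 = clamp8(ps0 + Filter2);
;   f &= ~hev;                              /* wide taps: ~3/7, 2/7, 1/7 */
;   int u = clamp8((63 + f * 27) >> 7);  qs0 = clamp8(qs0 - u);  ps0 = clamp8(ps0 + u);
;   u = clamp8((63 + f * 18) >> 7);      qs1 = clamp8(qs1 - u);  ps1 = clamp8(ps1 + u);
;   u = clamp8((63 + f * 9) >> 7);       qs2 = clamp8(qs2 - u);  ps2 = clamp8(ps2 + u);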


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldrb        r4, [r2]                    ; blimit
    ldr         r7, [src], pstep
    ldrb        r2, [r3]                    ; limit
    ldr         r8, [src], pstep
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r12]                   ; thresh
    orr         r2, r2, r2, lsl #8
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|Vnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to blimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         vskip_filter                 ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_filter() function
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
    str         r6, [sp]
    str         lr, [sp, #4]

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev (r7 : filter)

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    ;mvn        r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
    shadd8      r8 , r8 , r9
    shadd8      r7 , r7 , r9
    shadd8      lr , r8 , r9                ; lr: Filter2
    shadd8      r7 , r7 , r9                ; r7: filter

    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel        lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel        r8, r11, r9
    ;and        r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
    ;end of modification for vp8

    eor         r8, r8, r12
    eor         r9, r9, r12

    mov         lr, #0

    sadd8       r7, r7, r10
    shadd8      r7, r7, lr

    ldr         r10, [sp, #8]               ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    bic         r7, r7, r6                  ; r7: vp8_filter

    qsub8       r10 , r10, r7               ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    eor         r10, r10, r12
    eor         r11, r11, r12

    sub         src, src, pstep, lsl #2

    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
    ;output is b0, b1, b2, b3
    ;b0: 03 02 01 00
    ;b1: 13 12 11 10
    ;b2: 23 22 21 20
    ;b3: 33 32 31 30
    ;    p1 p0 q0 q1
    ;   (a3 a2 a1 a0)
    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr

    strh        r6, [src, #-2]              ; store the result
    mov         r6, r6, lsr #16
    strh        r6, [src], pstep

    strh        r7, [src, #-2]
    mov         r7, r7, lsr #16
    strh        r7, [src], pstep

    strh        r12, [src, #-2]
    mov         r12, r12, lsr #16
    strh        r12, [src], pstep

    strh        lr, [src, #-2]
    mov         lr, lr, lsr #16
    strh        lr, [src], pstep

|vskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    ldrne       r6, [src], pstep            ; load source data
    ldrne       r7, [src], pstep
    ldrne       r8, [src], pstep
    ldrne       lr, [src], pstep

    bne         Vnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|
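
; The vertical-edge routines reuse the same per-pixel math as the
; horizontal ones: pixels across a vertical edge are adjacent in memory,
; so each 4x4 block is loaded row by row, run through TRANSPOSE_MATRIX to
; bring one column into each register, filtered as above, transposed back,
; and written out with strh/strb pairs.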


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    pld         [src, #23]                  ; preload for next block
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldrb        r4, [r2]                    ; blimit
    pld         [src, #23]
    ldr         r7, [src], pstep
    ldrb        r2, [r3]                    ; limit
    pld         [src, #23]
    ldr         r8, [src], pstep
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r12]                   ; thresh
    orr         r2, r2, r2, lsl #8
    pld         [src, #23]
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|MBVnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back (f)limit accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to blimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbvskip_filter               ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ; vp8_mbfilter() function
    ; p2, q2 are only needed at the end. Don't need to load them in now.
    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    str         r6, [sp]                    ; save r6
    str         lr, [sp, #4]                ; save lr

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404
    ;mvn        r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
    shadd8      r8 , r8 , r10
    shadd8      r12 , r12 , r10
    shadd8      r8 , r8 , r10               ; r8: Filter1
    shadd8      r12 , r12 , r10             ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and        r8, r12, r10                ; s = Filter2 & 7 (s: r8)
    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov        r10, #0
    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
    ;sel        lr, r11, r10
    ;shadd8     r12 , r12 , r10
    ;usub8      r8, r9, r8
    ;sel        r8, r11, r10
    ;ldr        r9, [sp]                    ; load qs0
    ;ldr        r11, [sp, #4]               ; load ps0
    ;shadd8     r12 , r12 , r10
    ;and        r8, r8, lr                  ; -1 for each element that equals 4
    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
    ;mov        r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    eor         r10, r10, lr                ; *op0 = s^0x80

    strb        r10, [src, #-1]             ; store op0 result
    strb        r8, [src], pstep            ; store oq0 result
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1
    ldr         lr, c0x80808080

    uxtb16      r6, r6
    uxtb16      r10, r10

    add         src, src, #2

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    eor         r10, r10, lr                ; *op1 = s^0x80

    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
    strb        r10, [src, #-4]             ; store op1
    strb        r8, [src, #-1]              ; store oq1
    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #8
    orr         r9, r9, r7, lsl #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #16
    orr         r9, r9, r7, lsl #16

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep
    orr         r11, r11, r6, lsl #24
    orr         r9, r9, r7, lsl #24

    ;roughly 1/7th difference across boundary
    eor         r9, r9, lr
    eor         r11, r11, lr

    mov         lr, #0x9                    ; 9
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    uxtb16      r6, r6
    uxtb16      r10, r10

    ldr         lr, c0x80808080

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    eor         r10, r10, lr                ; *oq2 = s^0x80

    strb        r8, [src, #-5]              ; store *op2
    strb        r10, [src], pstep           ; store *oq2
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep

    ;adjust src pointer for next loop
    sub         src, src, #2

|mbvskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    pld         [src, #23]                  ; preload for next block
    ldrne       r6, [src], pstep            ; load source data
    pld         [src, #23]
    ldrne       r7, [src], pstep
    pld         [src, #23]
    ldrne       r8, [src], pstep
    pld         [src, #23]
    ldrne       lr, [src], pstep

    bne         MBVnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080
c0x03030303 DCD     0x03030303
c0x04040404 DCD     0x04040404
c0x01010101 DCD     0x01010101
c0x7F7F7F7F DCD     0x7F7F7F7F

    END