;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
13    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
14    EXPORT |vp8_loop_filter_vertical_edge_armv6|
15    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
16
17    AREA    |.text|, CODE, READONLY  ; name this block of code
18
19    MACRO
20    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
21    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
22    ; a0: 03 02 01 00
23    ; a1: 13 12 11 10
24    ; a2: 23 22 21 20
25    ; a3: 33 32 31 30
26    ;     b3 b2 b1 b0
27
28    uxtb16      $b1, $a1                    ; xx 12 xx 10
29    uxtb16      $b0, $a0                    ; xx 02 xx 00
30    uxtb16      $b3, $a3                    ; xx 32 xx 30
31    uxtb16      $b2, $a2                    ; xx 22 xx 20
32    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
33    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20
34
35    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
36    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
37    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
38    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
39    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
40    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21
41
42    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
43    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
44
45    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
46    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
47    MEND
48
49
50src         RN  r0
51pstep       RN  r1
52count       RN  r5
53
54;r0     unsigned char *src_ptr,
55;r1     int src_pixel_step,
56;r2     const char *flimit,
57;r3     const char *limit,
58;stack  const char *thresh,
59;stack  int  count
60
61;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
62;for flimit. Same way applies to limit and thresh.
63
64;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
65|vp8_loop_filter_horizontal_edge_armv6| PROC
66;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
67    stmdb       sp!, {r4 - r11, lr}
68
69    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
70    ldr         count, [sp, #40]            ; count for 8-in-parallel
71    ldr         r6, [sp, #36]               ; load thresh address
72    sub         sp, sp, #16                 ; create temp buffer
73
74    ldr         r9, [src], pstep            ; p3
75    ldr         r4, [r2], #4                ; flimit
76    ldr         r10, [src], pstep           ; p2
77    ldr         r2, [r3], #4                ; limit
78    ldr         r11, [src], pstep           ; p1
79    uadd8       r4, r4, r4                  ; flimit * 2
80    ldr         r3, [r6], #4                ; thresh
81    mov         count, count, lsl #1        ; 4-in-parallel
82    uadd8       r4, r4, r2                  ; flimit * 2 + limit
83
84|Hnext8|
85    ; vp8_filter_mask() function
86    ; calculate breakout conditions
87    ldr         r12, [src], pstep           ; p0
88
89    uqsub8      r6, r9, r10                 ; p3 - p2
90    uqsub8      r7, r10, r9                 ; p2 - p3
91    uqsub8      r8, r10, r11                ; p2 - p1
92    uqsub8      r10, r11, r10               ; p1 - p2
93
94    orr         r6, r6, r7                  ; abs (p3-p2)
95    orr         r8, r8, r10                 ; abs (p2-p1)
96    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
97    uqsub8      r8, r8, r2                  ; compare to limit
98    uqsub8      r6, r11, r12                ; p1 - p0
99    orr         lr, lr, r8
100    uqsub8      r7, r12, r11                ; p0 - p1
101    ldr         r9, [src], pstep            ; q0
102    ldr         r10, [src], pstep           ; q1
103    orr         r6, r6, r7                  ; abs (p1-p0)
104    uqsub8      r7, r6, r2                  ; compare to limit
105    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
106    orr         lr, lr, r7
107
108    uqsub8      r6, r11, r10                ; p1 - q1
109    uqsub8      r7, r10, r11                ; q1 - p1
110    uqsub8      r11, r12, r9                ; p0 - q0
111    uqsub8      r12, r9, r12                ; q0 - p0
112    orr         r6, r6, r7                  ; abs (p1-q1)
113    ldr         r7, c0x7F7F7F7F
114    orr         r12, r11, r12               ; abs (p0-q0)
115    ldr         r11, [src], pstep           ; q2
116    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
117    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
118    uqsub8      r7, r9, r10                 ; q0 - q1
119    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
120    uqsub8      r6, r10, r9                 ; q1 - q0
121    uqsub8      r12, r12, r4                ; compare to flimit
122    uqsub8      r9, r11, r10                ; q2 - q1
123
124    orr         lr, lr, r12
125
126    ldr         r12, [src], pstep           ; q3
127    uqsub8      r10, r10, r11               ; q1 - q2
128    orr         r6, r7, r6                  ; abs (q1-q0)
129    orr         r10, r9, r10                ; abs (q2-q1)
130    uqsub8      r7, r6, r2                  ; compare to limit
131    uqsub8      r10, r10, r2                ; compare to limit
132    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
133    orr         lr, lr, r7
134    orr         lr, lr, r10
135
136    uqsub8      r10, r12, r11               ; q3 - q2
137    uqsub8      r9, r11, r12                ; q2 - q3
138
139    mvn         r11, #0                     ; r11 == -1
140
141    orr         r10, r10, r9                ; abs (q3-q2)
142    uqsub8      r10, r10, r2                ; compare to limit
143
144    mov         r12, #0
145    orr         lr, lr, r10
146    sub         src, src, pstep, lsl #2
147
148    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
149    sel         lr, r11, r12                ; filter mask: lr
150
151    cmp         lr, #0
152    beq         hskip_filter                 ; skip filtering
153
154    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines
155
156    ;vp8_hevmask() function
157    ;calculate high edge variance
158    orr         r10, r6, r8                 ; calculate vp8_hevmask
159
160    ldr         r7, [src], pstep            ; p1
161
162    usub8       r10, r12, r10               ; use usub8 instead of ssub8
163    sel         r6, r12, r11                ; obtain vp8_hevmask: r6
164
165    ;vp8_filter() function
166    ldr         r8, [src], pstep            ; p0
167    ldr         r12, c0x80808080
168    ldr         r9, [src], pstep            ; q0
169    ldr         r10, [src], pstep           ; q1
170
171    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
172    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
173    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
174    eor         r10, r10, r12               ; q1 offset to convert to a signed value
175
176    str         r9, [sp]                    ; store qs0 temporarily
177    str         r8, [sp, #4]                ; store ps0 temporarily
178    str         r10, [sp, #8]               ; store qs1 temporarily
179    str         r7, [sp, #12]               ; store ps1 temporarily
180
181    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
182    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
183
184    and         r7, r7, r6                  ; vp8_filter (r7) &= hev
185
186    qadd8       r7, r7, r8
187    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
188
189    qadd8       r7, r7, r8
190    ldr         r10, c0x04040404
191
192    qadd8       r7, r7, r8
193    and         r7, r7, lr                  ; vp8_filter &= mask;
194
195    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
196    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
197    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
198
199    mov         r9, #0
200    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
201    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
202    shadd8      r8 , r8 , r9
203    shadd8      r7 , r7 , r9
204    shadd8      lr , r8 , r9                ; lr: Filter2
205    shadd8      r7 , r7 , r9                ; r7: filter
206
207    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
208    ;sel        lr, r11, r9
209    ;usub8      r8, r10, r8
210    ;sel        r8, r11, r9
211    ;and        r8, r8, lr                  ; -1 for each element that equals 4
212
213    ;calculate output
214    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)
215
216    ldr         r8, [sp]                    ; load qs0
217    ldr         r9, [sp, #4]                ; load ps0
218
219    ldr         r10, c0x01010101
220
221    qsub8       r8 ,r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
222    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
223
224    ;end of modification for vp8
225
226    mov         lr, #0
227    sadd8       r7, r7 , r10                ; vp8_filter += 1
228    shadd8      r7, r7, lr                  ; vp8_filter >>= 1
229
230    ldr         r11, [sp, #12]              ; load ps1
231    ldr         r10, [sp, #8]               ; load qs1
232
233    bic         r7, r7, r6                  ; vp8_filter &= ~hev
234    sub         src, src, pstep, lsl #2
235
236    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
237    qsub8       r10, r10,r7                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
238
239    eor         r11, r11, r12               ; *op1 = u^0x80
240    str         r11, [src], pstep           ; store op1
241    eor         r9, r9, r12                 ; *op0 = u^0x80
242    str         r9, [src], pstep            ; store op0 result
243    eor         r8, r8, r12                 ; *oq0 = u^0x80
244    str         r8, [src], pstep            ; store oq0 result
245    eor         r10, r10, r12               ; *oq1 = u^0x80
246    str         r10, [src], pstep           ; store oq1
247
248    sub         src, src, pstep, lsl #1
249
250|hskip_filter|
251    add         src, src, #4
252    sub         src, src, pstep, lsl #2
253
254    subs        count, count, #1
255
256    ;pld            [src]
257    ;pld            [src, pstep]
258    ;pld            [src, pstep, lsl #1]
259    ;pld            [src, pstep, lsl #2]
260    ;pld            [src, pstep, lsl #3]
261
262    ldrne       r9, [src], pstep            ; p3
263    ldrne       r10, [src], pstep           ; p2
264    ldrne       r11, [src], pstep           ; p1
265
266    bne         Hnext8
267
268    add         sp, sp, #16
269    ldmia       sp!, {r4 - r11, pc}
270    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|
271
272
273;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
274|vp8_mbloop_filter_horizontal_edge_armv6| PROC
275;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
276    stmdb       sp!, {r4 - r11, lr}
277
278    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
279    ldr         count, [sp, #40]            ; count for 8-in-parallel
280    ldr         r6, [sp, #36]               ; load thresh address
281    sub         sp, sp, #16                 ; create temp buffer
282
283    ldr         r9, [src], pstep            ; p3
284    ldr         r4, [r2], #4                ; flimit
285    ldr         r10, [src], pstep           ; p2
286    ldr         r2, [r3], #4                ; limit
287    ldr         r11, [src], pstep           ; p1
288    uadd8       r4, r4, r4                  ; flimit * 2
289    ldr         r3, [r6], #4                ; thresh
290    mov         count, count, lsl #1        ; 4-in-parallel
291    uadd8       r4, r4, r2                  ; flimit * 2 + limit
292
293|MBHnext8|
294
295    ; vp8_filter_mask() function
296    ; calculate breakout conditions
297    ldr         r12, [src], pstep           ; p0
298
299    uqsub8      r6, r9, r10                 ; p3 - p2
300    uqsub8      r7, r10, r9                 ; p2 - p3
301    uqsub8      r8, r10, r11                ; p2 - p1
302    uqsub8      r10, r11, r10               ; p1 - p2
303
304    orr         r6, r6, r7                  ; abs (p3-p2)
305    orr         r8, r8, r10                 ; abs (p2-p1)
306    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
307    uqsub8      r8, r8, r2                  ; compare to limit
308
309    uqsub8      r6, r11, r12                ; p1 - p0
310    orr         lr, lr, r8
311    uqsub8      r7, r12, r11                ; p0 - p1
312    ldr         r9, [src], pstep            ; q0
313    ldr         r10, [src], pstep           ; q1
314    orr         r6, r6, r7                  ; abs (p1-p0)
315    uqsub8      r7, r6, r2                  ; compare to limit
316    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
317    orr         lr, lr, r7
318
319    uqsub8      r6, r11, r10                ; p1 - q1
320    uqsub8      r7, r10, r11                ; q1 - p1
321    uqsub8      r11, r12, r9                ; p0 - q0
322    uqsub8      r12, r9, r12                ; q0 - p0
323    orr         r6, r6, r7                  ; abs (p1-q1)
324    ldr         r7, c0x7F7F7F7F
325    orr         r12, r11, r12               ; abs (p0-q0)
326    ldr         r11, [src], pstep           ; q2
327    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
328    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
329    uqsub8      r7, r9, r10                 ; q0 - q1
330    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
331    uqsub8      r6, r10, r9                 ; q1 - q0
332    uqsub8      r12, r12, r4                ; compare to flimit
333    uqsub8      r9, r11, r10                ; q2 - q1
334
335    orr         lr, lr, r12
336
337    ldr         r12, [src], pstep           ; q3
338
339    uqsub8      r10, r10, r11               ; q1 - q2
340    orr         r6, r7, r6                  ; abs (q1-q0)
341    orr         r10, r9, r10                ; abs (q2-q1)
342    uqsub8      r7, r6, r2                  ; compare to limit
343    uqsub8      r10, r10, r2                ; compare to limit
344    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
345    orr         lr, lr, r7
346    orr         lr, lr, r10
347
348    uqsub8      r10, r12, r11               ; q3 - q2
349    uqsub8      r9, r11, r12                ; q2 - q3
350
351    mvn         r11, #0                     ; r11 == -1
352
353    orr         r10, r10, r9                ; abs (q3-q2)
354    uqsub8      r10, r10, r2                ; compare to limit
355
356    mov         r12, #0
357
358    orr         lr, lr, r10
359
360    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
361    sel         lr, r11, r12                ; filter mask: lr
362
363    cmp         lr, #0
364    beq         mbhskip_filter               ; skip filtering
365
366    ;vp8_hevmask() function
367    ;calculate high edge variance
368    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
369    sub         src, src, pstep, lsl #1
370
371    orr         r10, r6, r8
372    ldr         r7, [src], pstep            ; p1
373
374    usub8       r10, r12, r10
375    sel         r6, r12, r11                ; hev mask: r6
376
377    ;vp8_mbfilter() function
378    ;p2, q2 are only needed at the end. Don't need to load them in now.
379    ldr         r8, [src], pstep            ; p0
380    ldr         r12, c0x80808080
381    ldr         r9, [src], pstep            ; q0
382    ldr         r10, [src]                  ; q1
383
384    eor         r7, r7, r12                 ; ps1
385    eor         r8, r8, r12                 ; ps0
386    eor         r9, r9, r12                 ; qs0
387    eor         r10, r10, r12               ; qs1
388
389    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
390    str         r7, [sp, #12]               ; store ps1 temporarily
391    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
392    str         r10, [sp, #8]               ; store qs1 temporarily
393    qadd8       r7, r7, r12
394    str         r9, [sp]                    ; store qs0 temporarily
395    qadd8       r7, r7, r12
396    str         r8, [sp, #4]                ; store ps0 temporarily
397    qadd8       r7, r7, r12                 ; vp8_filter: r7
398
399    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
400    ldr         r9, c0x04040404
401
402    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)
403
404    mov         r12, r7                     ; Filter2: r12
405    and         r12, r12, r6                ; Filter2 &= hev
406
407    ;modify code for vp8
408    ;save bottom 3 bits so that we round one side +4 and the other +3
409    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
410    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
411
412    mov         r10, #0
413    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
414    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
415    shadd8      r8 , r8 , r10
416    shadd8      r12 , r12 , r10
417    shadd8      r8 , r8 , r10               ; r8: Filter1
418    shadd8      r12 , r12 , r10             ; r12: Filter2
419
420    ldr         r9, [sp]                    ; load qs0
421    ldr         r11, [sp, #4]               ; load ps0
422
423    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
424    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
425
426    ;save bottom 3 bits so that we round one side +4 and the other +3
427    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
428    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
429    ;mov            r10, #0
430    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
431    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
432    ;sel            lr, r11, r10
433    ;shadd8     r12 , r12 , r10
434    ;usub8      r8, r9, r8
435    ;sel            r8, r11, r10
436    ;ldr            r9, [sp]                    ; load qs0
437    ;ldr            r11, [sp, #4]               ; load ps0
438    ;shadd8     r12 , r12 , r10
439    ;and            r8, r8, lr                  ; -1 for each element that equals 4
440    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
441    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
442    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)
443
444    ;end of modification for vp8
445
446    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
447    ;mov        r12, r7
448
449    ;roughly 3/7th difference across boundary
450    mov         lr, #0x1b                   ; 27
451    mov         r7, #0x3f                   ; 63
452
453    sxtb16      r6, r12
454    sxtb16      r10, r12, ror #8
455    smlabb      r8, r6, lr, r7
456    smlatb      r6, r6, lr, r7
457    smlabb      r7, r10, lr, r7
458    smultb      r10, r10, lr
459    ssat        r8, #8, r8, asr #7
460    ssat        r6, #8, r6, asr #7
461    add         r10, r10, #63
462    ssat        r7, #8, r7, asr #7
463    ssat        r10, #8, r10, asr #7
464
465    ldr         lr, c0x80808080
466
467    pkhbt       r6, r8, r6, lsl #16
468    pkhbt       r10, r7, r10, lsl #16
469    uxtb16      r6, r6
470    uxtb16      r10, r10
471
472    sub         src, src, pstep
473
474    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
475
476    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
477    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
478    eor         r8, r8, lr                  ; *oq0 = s^0x80
479    str         r8, [src]                   ; store *oq0
480    sub         src, src, pstep
481    eor         r10, r10, lr                ; *op0 = s^0x80
482    str         r10, [src]                  ; store *op0
483
484    ;roughly 2/7th difference across boundary
485    mov         lr, #0x12                   ; 18
486    mov         r7, #0x3f                   ; 63
487
488    sxtb16      r6, r12
489    sxtb16      r10, r12, ror #8
490    smlabb      r8, r6, lr, r7
491    smlatb      r6, r6, lr, r7
492    smlabb      r9, r10, lr, r7
493    smlatb      r10, r10, lr, r7
494    ssat        r8, #8, r8, asr #7
495    ssat        r6, #8, r6, asr #7
496    ssat        r9, #8, r9, asr #7
497    ssat        r10, #8, r10, asr #7
498
499    ldr         lr, c0x80808080
500
501    pkhbt       r6, r8, r6, lsl #16
502    pkhbt       r10, r9, r10, lsl #16
503
504    ldr         r9, [sp, #8]                ; load qs1
505    ldr         r11, [sp, #12]              ; load ps1
506
507    uxtb16      r6, r6
508    uxtb16      r10, r10
509
510    sub         src, src, pstep
511
512    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
513
514    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
515    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
516    eor         r11, r11, lr                ; *op1 = s^0x80
517    str         r11, [src], pstep           ; store *op1
518    eor         r8, r8, lr                  ; *oq1 = s^0x80
519    add         src, src, pstep, lsl #1
520
521    mov         r7, #0x3f                   ; 63
522
523    str         r8, [src], pstep            ; store *oq1
524
525    ;roughly 1/7th difference across boundary
526    mov         lr, #0x9                    ; 9
527    ldr         r9, [src]                   ; load q2
528
529    sxtb16      r6, r12
530    sxtb16      r10, r12, ror #8
531    smlabb      r8, r6, lr, r7
532    smlatb      r6, r6, lr, r7
533    smlabb      r12, r10, lr, r7
534    smlatb      r10, r10, lr, r7
535    ssat        r8, #8, r8, asr #7
536    ssat        r6, #8, r6, asr #7
537    ssat        r12, #8, r12, asr #7
538    ssat        r10, #8, r10, asr #7
539
540    sub         src, src, pstep, lsl #2
541
542    pkhbt       r6, r8, r6, lsl #16
543    pkhbt       r10, r12, r10, lsl #16
544
545    sub         src, src, pstep
546    ldr         lr, c0x80808080
547
548    ldr         r11, [src]                  ; load p2
549
550    uxtb16      r6, r6
551    uxtb16      r10, r10
552
553    eor         r9, r9, lr
554    eor         r11, r11, lr
555
556    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
557
558    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
559    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
560    eor         r8, r8, lr                  ; *op2 = s^0x80
561    str         r8, [src], pstep, lsl #2    ; store *op2
562    add         src, src, pstep
563    eor         r10, r10, lr                ; *oq2 = s^0x80
564    str         r10, [src], pstep, lsl #1   ; store *oq2
565
566|mbhskip_filter|
567    add         src, src, #4
568    sub         src, src, pstep, lsl #3
569    subs        count, count, #1
570
571    ldrne       r9, [src], pstep            ; p3
572    ldrne       r10, [src], pstep           ; p2
573    ldrne       r11, [src], pstep           ; p1
574
575    bne         MBHnext8
576
577    add         sp, sp, #16
578    ldmia       sp!, {r4 - r11, pc}
579    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|
580
581
582;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
583|vp8_loop_filter_vertical_edge_armv6| PROC
584;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
585    stmdb       sp!, {r4 - r11, lr}
586
587    sub         src, src, #4                ; move src pointer down by 4
588    ldr         count, [sp, #40]            ; count for 8-in-parallel
589    ldr         r12, [sp, #36]              ; load thresh address
590    sub         sp, sp, #16                 ; create temp buffer
591
592    ldr         r6, [src], pstep            ; load source data
593    ldr         r4, [r2], #4                ; flimit
594    ldr         r7, [src], pstep
595    ldr         r2, [r3], #4                ; limit
596    ldr         r8, [src], pstep
597    uadd8       r4, r4, r4                  ; flimit * 2
598    ldr         r3, [r12], #4               ; thresh
599    ldr         lr, [src], pstep
600    mov         count, count, lsl #1        ; 4-in-parallel
601    uadd8       r4, r4, r2                  ; flimit * 2 + limit
602
603|Vnext8|
604
605    ; vp8_filter_mask() function
606    ; calculate breakout conditions
607    ; transpose the source data for 4-in-parallel operation
608    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
609
610    uqsub8      r7, r9, r10                 ; p3 - p2
611    uqsub8      r8, r10, r9                 ; p2 - p3
612    uqsub8      r9, r10, r11                ; p2 - p1
613    uqsub8      r10, r11, r10               ; p1 - p2
614    orr         r7, r7, r8                  ; abs (p3-p2)
615    orr         r10, r9, r10                ; abs (p2-p1)
616    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
617    uqsub8      r10, r10, r2                ; compare to limit
618
619    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
620
621    orr         lr, lr, r10
622
623    uqsub8      r6, r11, r12                ; p1 - p0
624    uqsub8      r7, r12, r11                ; p0 - p1
625    add         src, src, #4                ; move src pointer up by 4
626    orr         r6, r6, r7                  ; abs (p1-p0)
627    str         r11, [sp, #12]              ; save p1
628    uqsub8      r10, r6, r2                 ; compare to limit
629    uqsub8      r11, r6, r3                 ; compare to thresh
630    orr         lr, lr, r10
631
632    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
633    ; transpose the source data for 4-in-parallel operation
634    ldr         r6, [src], pstep            ; load source data
635    str         r11, [sp]                   ; push r11 to stack
636    ldr         r7, [src], pstep
637    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
638    ldr         r8, [src], pstep
639    str         lr, [sp, #8]
640    ldr         lr, [src], pstep
641
642    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
643
644    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
645
646    uqsub8      r6, r12, r11                ; q3 - q2
647    uqsub8      r7, r11, r12                ; q2 - q3
648    uqsub8      r12, r11, r10               ; q2 - q1
649    uqsub8      r11, r10, r11               ; q1 - q2
650    orr         r6, r6, r7                  ; abs (q3-q2)
651    orr         r7, r12, r11                ; abs (q2-q1)
652    uqsub8      r6, r6, r2                  ; compare to limit
653    uqsub8      r7, r7, r2                  ; compare to limit
654    ldr         r11, [sp, #4]               ; load back p0
655    ldr         r12, [sp, #12]              ; load back p1
656    orr         lr, lr, r6
657    orr         lr, lr, r7
658
659    uqsub8      r6, r11, r9                 ; p0 - q0
660    uqsub8      r7, r9, r11                 ; q0 - p0
661    uqsub8      r8, r12, r10                ; p1 - q1
662    uqsub8      r11, r10, r12               ; q1 - p1
663    orr         r6, r6, r7                  ; abs (p0-q0)
664    ldr         r7, c0x7F7F7F7F
665    orr         r8, r8, r11                 ; abs (p1-q1)
666    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
667    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
668    uqsub8      r11, r10, r9                ; q1 - q0
669    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
670    uqsub8      r12, r9, r10                ; q0 - q1
671    uqsub8      r6, r6, r4                  ; compare to flimit
672
673    orr         r9, r11, r12                ; abs (q1-q0)
674    uqsub8      r8, r9, r2                  ; compare to limit
675    uqsub8      r10, r9, r3                 ; compare to thresh
676    orr         lr, lr, r6
677    orr         lr, lr, r8
678
679    mvn         r11, #0                     ; r11 == -1
680    mov         r12, #0
681
682    usub8       lr, r12, lr
683    ldr         r9, [sp]                    ; load the compared result
684    sel         lr, r11, r12                ; filter mask: lr
685
686    cmp         lr, #0
687    beq         vskip_filter                 ; skip filtering
688
689    ;vp8_hevmask() function
690    ;calculate high edge variance
691
692    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
693
694    orr         r9, r9, r10
695
696    ldrh        r7, [src, #-2]
697    ldrh        r8, [src], pstep
698
699    usub8       r9, r12, r9
700    sel         r6, r12, r11                ; hev mask: r6
701
702    ;vp8_filter() function
703    ; load soure data to r6, r11, r12, lr
704    ldrh        r9, [src, #-2]
705    ldrh        r10, [src], pstep
706
707    pkhbt       r12, r7, r8, lsl #16
708
709    ldrh        r7, [src, #-2]
710    ldrh        r8, [src], pstep
711
712    pkhbt       r11, r9, r10, lsl #16
713
714    ldrh        r9, [src, #-2]
715    ldrh        r10, [src], pstep
716
717    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
718    str         r6, [sp]
719    str         lr, [sp, #4]
720
721    pkhbt       r6, r7, r8, lsl #16
722    pkhbt       lr, r9, r10, lsl #16
723
724    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
725    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
726
727    ;load back hev_mask r6 and filter_mask lr
728    ldr         r12, c0x80808080
729    ldr         r6, [sp]
730    ldr         lr, [sp, #4]
731
732    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
733    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
734    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
735    eor         r10, r10, r12               ; q1 offset to convert to a signed value
736
737    str         r9, [sp]                    ; store qs0 temporarily
738    str         r8, [sp, #4]                ; store ps0 temporarily
739    str         r10, [sp, #8]               ; store qs1 temporarily
740    str         r7, [sp, #12]               ; store ps1 temporarily
741
742    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
743    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
744
745    and         r7, r7, r6                  ;  vp8_filter (r7) &= hev (r7 : filter)
746
747    qadd8       r7, r7, r8
748    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
749
750    qadd8       r7, r7, r8
751    ldr         r10, c0x04040404
752
753    qadd8       r7, r7, r8
754    ;mvn         r11, #0                     ; r11 == -1
755
756    and         r7, r7, lr                  ; vp8_filter &= mask
757
758    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
759    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
760    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
761
762    mov         r9, #0
763    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
764    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
765    shadd8      r8 , r8 , r9
766    shadd8      r7 , r7 , r9
767    shadd8      lr , r8 , r9                ; lr: filter2
768    shadd8      r7 , r7 , r9                ; r7: filter
769
770    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
771    ;sel            lr, r11, r9
772    ;usub8      r8, r10, r8
773    ;sel            r8, r11, r9
774    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
775
776    ;calculate output
777    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)
778
779    ldr         r8, [sp]                    ; load qs0
780    ldr         r9, [sp, #4]                ; load ps0
781
782    ldr         r10, c0x01010101
783
784    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
785    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
786    ;end of modification for vp8
787
788    eor         r8, r8, r12
789    eor         r9, r9, r12
790
791    mov         lr, #0
792
793    sadd8       r7, r7, r10
794    shadd8      r7, r7, lr
795
796    ldr         r10, [sp, #8]               ; load qs1
797    ldr         r11, [sp, #12]              ; load ps1
798
799    bic         r7, r7, r6                  ; r7: vp8_filter
800
801    qsub8       r10 , r10, r7               ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
802    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
803    eor         r10, r10, r12
804    eor         r11, r11, r12
805
806    sub         src, src, pstep, lsl #2
807
808    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
809    ;output is b0, b1, b2, b3
810    ;b0: 03 02 01 00
811    ;b1: 13 12 11 10
812    ;b2: 23 22 21 20
813    ;b3: 33 32 31 30
814    ;    p1 p0 q0 q1
815    ;   (a3 a2 a1 a0)
816    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
817
818    strh        r6, [src, #-2]              ; store the result
819    mov         r6, r6, lsr #16
820    strh        r6, [src], pstep
821
822    strh        r7, [src, #-2]
823    mov         r7, r7, lsr #16
824    strh        r7, [src], pstep
825
826    strh        r12, [src, #-2]
827    mov         r12, r12, lsr #16
828    strh        r12, [src], pstep
829
830    strh        lr, [src, #-2]
831    mov         lr, lr, lsr #16
832    strh        lr, [src], pstep
833
834|vskip_filter|
835    sub         src, src, #4
836    subs        count, count, #1
837
838    ldrne       r6, [src], pstep            ; load source data
839    ldrne       r7, [src], pstep
840    ldrne       r8, [src], pstep
841    ldrne       lr, [src], pstep
842
843    bne         Vnext8
844
845    add         sp, sp, #16
846
847    ldmia       sp!, {r4 - r11, pc}
848    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|
849
850
851
852;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
853|vp8_mbloop_filter_vertical_edge_armv6| PROC
854;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
855    stmdb       sp!, {r4 - r11, lr}
856
857    sub         src, src, #4                ; move src pointer down by 4
858    ldr         count, [sp, #40]            ; count for 8-in-parallel
859    ldr         r12, [sp, #36]              ; load thresh address
860    sub         sp, sp, #16                 ; create temp buffer
861
862    ldr         r6, [src], pstep            ; load source data
863    ldr         r4, [r2], #4                ; flimit
864    ldr         r7, [src], pstep
865    ldr         r2, [r3], #4                ; limit
866    ldr         r8, [src], pstep
867    uadd8       r4, r4, r4                  ; flimit * 2
868    ldr         r3, [r12], #4               ; thresh
869    ldr         lr, [src], pstep
870    mov         count, count, lsl #1        ; 4-in-parallel
871    uadd8       r4, r4, r2                  ; flimit * 2 + limit
872
873|MBVnext8|
874    ; vp8_filter_mask() function
875    ; calculate breakout conditions
876    ; transpose the source data for 4-in-parallel operation
877    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
878
879    uqsub8      r7, r9, r10                 ; p3 - p2
880    uqsub8      r8, r10, r9                 ; p2 - p3
881    uqsub8      r9, r10, r11                ; p2 - p1
882    uqsub8      r10, r11, r10               ; p1 - p2
883    orr         r7, r7, r8                  ; abs (p3-p2)
884    orr         r10, r9, r10                ; abs (p2-p1)
885    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
886    uqsub8      r10, r10, r2                ; compare to limit
887
888    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
889
890    orr         lr, lr, r10
891
892    uqsub8      r6, r11, r12                ; p1 - p0
893    uqsub8      r7, r12, r11                ; p0 - p1
894    add         src, src, #4                ; move src pointer up by 4
895    orr         r6, r6, r7                  ; abs (p1-p0)
896    str         r11, [sp, #12]              ; save p1
897    uqsub8      r10, r6, r2                 ; compare to limit
898    uqsub8      r11, r6, r3                 ; compare to thresh
899    orr         lr, lr, r10
900
901    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
902    ; transpose the source data for 4-in-parallel operation
903    ldr         r6, [src], pstep            ; load source data
904    str         r11, [sp]                   ; push r11 to stack
905    ldr         r7, [src], pstep
906    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
907    ldr         r8, [src], pstep
908    str         lr, [sp, #8]
909    ldr         lr, [src], pstep
910
911    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
912
913    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
914
915    uqsub8      r6, r12, r11                ; q3 - q2
916    uqsub8      r7, r11, r12                ; q2 - q3
917    uqsub8      r12, r11, r10               ; q2 - q1
918    uqsub8      r11, r10, r11               ; q1 - q2
919    orr         r6, r6, r7                  ; abs (q3-q2)
920    orr         r7, r12, r11                ; abs (q2-q1)
921    uqsub8      r6, r6, r2                  ; compare to limit
922    uqsub8      r7, r7, r2                  ; compare to limit
923    ldr         r11, [sp, #4]               ; load back p0
924    ldr         r12, [sp, #12]              ; load back p1
925    orr         lr, lr, r6
926    orr         lr, lr, r7
927
928    uqsub8      r6, r11, r9                 ; p0 - q0
929    uqsub8      r7, r9, r11                 ; q0 - p0
930    uqsub8      r8, r12, r10                ; p1 - q1
931    uqsub8      r11, r10, r12               ; q1 - p1
932    orr         r6, r6, r7                  ; abs (p0-q0)
933    ldr         r7, c0x7F7F7F7F
934    orr         r8, r8, r11                 ; abs (p1-q1)
935    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
936    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
937    uqsub8      r11, r10, r9                ; q1 - q0
938    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
939    uqsub8      r12, r9, r10                ; q0 - q1
940    uqsub8      r6, r6, r4                  ; compare to flimit
941
942    orr         r9, r11, r12                ; abs (q1-q0)
943    uqsub8      r8, r9, r2                  ; compare to limit
944    uqsub8      r10, r9, r3                 ; compare to thresh
945    orr         lr, lr, r6
946    orr         lr, lr, r8
947
948    mvn         r11, #0                     ; r11 == -1
949    mov         r12, #0
950
951    usub8       lr, r12, lr
952    ldr         r9, [sp]                    ; load the compared result
953    sel         lr, r11, r12                ; filter mask: lr
954
955    cmp         lr, #0
956    beq         mbvskip_filter               ; skip filtering
957
958
959    ;vp8_hevmask() function
960    ;calculate high edge variance
961
962    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
963
964    orr         r9, r9, r10
965
966    ldrh        r7, [src, #-2]
967    ldrh        r8, [src], pstep
968
969    usub8       r9, r12, r9
970    sel         r6, r12, r11                ; hev mask: r6
971
972
973    ; vp8_mbfilter() function
974    ; p2, q2 are only needed at the end. Don't need to load them in now.
975    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
976    ; load soure data to r6, r11, r12, lr
977    ldrh        r9, [src, #-2]
978    ldrh        r10, [src], pstep
979
980    pkhbt       r12, r7, r8, lsl #16
981
982    ldrh        r7, [src, #-2]
983    ldrh        r8, [src], pstep
984
985    pkhbt       r11, r9, r10, lsl #16
986
987    ldrh        r9, [src, #-2]
988    ldrh        r10, [src], pstep
989
990    str         r6, [sp]                    ; save r6
991    str         lr, [sp, #4]                ; save lr
992
993    pkhbt       r6, r7, r8, lsl #16
994    pkhbt       lr, r9, r10, lsl #16
995
996    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
997    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
998
999    ;load back hev_mask r6 and filter_mask lr
1000    ldr         r12, c0x80808080
1001    ldr         r6, [sp]
1002    ldr         lr, [sp, #4]
1003
1004    eor         r7, r7, r12                 ; ps1
1005    eor         r8, r8, r12                 ; ps0
1006    eor         r9, r9, r12                 ; qs0
1007    eor         r10, r10, r12               ; qs1
1008
1009    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
1010    str         r7, [sp, #12]               ; store ps1 temporarily
1011    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
1012    str         r10, [sp, #8]               ; store qs1 temporarily
1013    qadd8       r7, r7, r12
1014    str         r9, [sp]                    ; store qs0 temporarily
1015    qadd8       r7, r7, r12
1016    str         r8, [sp, #4]                ; store ps0 temporarily
1017    qadd8       r7, r7, r12                 ; vp8_filter: r7
1018
1019    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
1020    ldr         r9, c0x04040404
1021    ;mvn         r11, #0                     ; r11 == -1
1022
1023    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)
1024
1025    mov         r12, r7                     ; Filter2: r12
1026    and         r12, r12, r6                ; Filter2 &= hev
1027
1028    ;modify code for vp8
1029    ;save bottom 3 bits so that we round one side +4 and the other +3
1030    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
1031    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
1032
1033    mov         r10, #0
1034    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
1035    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
1036    shadd8      r8 , r8 , r10
1037    shadd8      r12 , r12 , r10
1038    shadd8      r8 , r8 , r10               ; r8: Filter1
1039    shadd8      r12 , r12 , r10             ; r12: Filter2
1040
1041    ldr         r9, [sp]                    ; load qs0
1042    ldr         r11, [sp, #4]               ; load ps0
1043
1044    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
1045    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
1046
1047    ;save bottom 3 bits so that we round one side +4 and the other +3
1048    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
1049    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
1050    ;mov            r10, #0
1051    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
1052    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
1053    ;sel            lr, r11, r10
1054    ;shadd8     r12 , r12 , r10
1055    ;usub8      r8, r9, r8
1056    ;sel            r8, r11, r10
1057    ;ldr            r9, [sp]                    ; load qs0
1058    ;ldr            r11, [sp, #4]               ; load ps0
1059    ;shadd8     r12 , r12 , r10
1060    ;and            r8, r8, lr                  ; -1 for each element that equals 4
1061    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
1062    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
1063    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)
1064
1065    ;end of modification for vp8
1066
1067    bic         r12, r7, r6                 ;vp8_filter &= ~hev    ( r6 is free)
1068    ;mov            r12, r7
1069
1070    ;roughly 3/7th difference across boundary
1071    mov         lr, #0x1b                   ; 27
1072    mov         r7, #0x3f                   ; 63
1073
1074    sxtb16      r6, r12
1075    sxtb16      r10, r12, ror #8
1076    smlabb      r8, r6, lr, r7
1077    smlatb      r6, r6, lr, r7
1078    smlabb      r7, r10, lr, r7
1079    smultb      r10, r10, lr
1080    ssat        r8, #8, r8, asr #7
1081    ssat        r6, #8, r6, asr #7
1082    add         r10, r10, #63
1083    ssat        r7, #8, r7, asr #7
1084    ssat        r10, #8, r10, asr #7
1085
1086    ldr         lr, c0x80808080
1087
1088    pkhbt       r6, r8, r6, lsl #16
1089    pkhbt       r10, r7, r10, lsl #16
1090    uxtb16      r6, r6
1091    uxtb16      r10, r10
1092
1093    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
1094
1095    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
1096
1097    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
1098    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
1099    eor         r8, r8, lr                  ; *oq0 = s^0x80
1100    eor         r10, r10, lr                ; *op0 = s^0x80
1101
1102    strb        r10, [src, #-1]             ; store op0 result
1103    strb        r8, [src], pstep            ; store oq0 result
1104    mov         r10, r10, lsr #8
1105    mov         r8, r8, lsr #8
1106    strb        r10, [src, #-1]
1107    strb        r8, [src], pstep
1108    mov         r10, r10, lsr #8
1109    mov         r8, r8, lsr #8
1110    strb        r10, [src, #-1]
1111    strb        r8, [src], pstep
1112    mov         r10, r10, lsr #8
1113    mov         r8, r8, lsr #8
1114    strb        r10, [src, #-1]
1115    strb        r8, [src], pstep
1116
1117    ;roughly 2/7th difference across boundary
1118    mov         lr, #0x12                   ; 18
1119    mov         r7, #0x3f                   ; 63
1120
1121    sxtb16      r6, r12
1122    sxtb16      r10, r12, ror #8
1123    smlabb      r8, r6, lr, r7
1124    smlatb      r6, r6, lr, r7
1125    smlabb      r9, r10, lr, r7
1126    smlatb      r10, r10, lr, r7
1127    ssat        r8, #8, r8, asr #7
1128    ssat        r6, #8, r6, asr #7
1129    ssat        r9, #8, r9, asr #7
1130    ssat        r10, #8, r10, asr #7
1131
1132    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
1133
1134    pkhbt       r6, r8, r6, lsl #16
1135    pkhbt       r10, r9, r10, lsl #16
1136
1137    ldr         r9, [sp, #8]                ; load qs1
1138    ldr         r11, [sp, #12]              ; load ps1
1139    ldr         lr, c0x80808080
1140
1141    uxtb16      r6, r6
1142    uxtb16      r10, r10
1143
1144    add         src, src, #2
1145
1146    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
1147
1148    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
1149    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
1150    eor         r8, r8, lr                  ; *oq1 = s^0x80
1151    eor         r10, r10, lr                ; *op1 = s^0x80
1152
1153    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
1154    strb        r10, [src, #-4]             ; store op1
1155    strb        r8, [src, #-1]              ; store oq1
1156    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
1157
1158    mov         r10, r10, lsr #8
1159    mov         r8, r8, lsr #8
1160
1161    ldrb        r6, [src, #-5]
1162    strb        r10, [src, #-4]
1163    strb        r8, [src, #-1]
1164    ldrb        r7, [src], pstep
1165
1166    mov         r10, r10, lsr #8
1167    mov         r8, r8, lsr #8
1168    orr         r11, r11, r6, lsl #8
1169    orr         r9, r9, r7, lsl #8
1170
1171    ldrb        r6, [src, #-5]
1172    strb        r10, [src, #-4]
1173    strb        r8, [src, #-1]
1174    ldrb        r7, [src], pstep
1175
1176    mov         r10, r10, lsr #8
1177    mov         r8, r8, lsr #8
1178    orr         r11, r11, r6, lsl #16
1179    orr         r9, r9, r7, lsl #16
1180
1181    ldrb        r6, [src, #-5]
1182    strb        r10, [src, #-4]
1183    strb        r8, [src, #-1]
1184    ldrb        r7, [src], pstep
1185    orr         r11, r11, r6, lsl #24
1186    orr         r9, r9, r7, lsl #24
1187
1188    ;roughly 1/7th difference across boundary
1189    eor         r9, r9, lr
1190    eor         r11, r11, lr
1191
1192    mov         lr, #0x9                    ; 9
1193    mov         r7, #0x3f                   ; 63
1194
1195    sxtb16      r6, r12
1196    sxtb16      r10, r12, ror #8
1197    smlabb      r8, r6, lr, r7
1198    smlatb      r6, r6, lr, r7
1199    smlabb      r12, r10, lr, r7
1200    smlatb      r10, r10, lr, r7
1201    ssat        r8, #8, r8, asr #7
1202    ssat        r6, #8, r6, asr #7
1203    ssat        r12, #8, r12, asr #7
1204    ssat        r10, #8, r10, asr #7
1205
1206    sub         src, src, pstep, lsl #2
1207
1208    pkhbt       r6, r8, r6, lsl #16
1209    pkhbt       r10, r12, r10, lsl #16
1210
1211    uxtb16      r6, r6
1212    uxtb16      r10, r10
1213
1214    ldr         lr, c0x80808080
1215
1216    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
1217
1218    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
1219    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
1220    eor         r8, r8, lr                  ; *op2 = s^0x80
1221    eor         r10, r10, lr                ; *oq2 = s^0x80
1222
1223    strb        r8, [src, #-5]              ; store *op2
1224    strb        r10, [src], pstep           ; store *oq2
1225    mov         r8, r8, lsr #8
1226    mov         r10, r10, lsr #8
1227    strb        r8, [src, #-5]
1228    strb        r10, [src], pstep
1229    mov         r8, r8, lsr #8
1230    mov         r10, r10, lsr #8
1231    strb        r8, [src, #-5]
1232    strb        r10, [src], pstep
1233    mov         r8, r8, lsr #8
1234    mov         r10, r10, lsr #8
1235    strb        r8, [src, #-5]
1236    strb        r10, [src], pstep
1237
1238    ;adjust src pointer for next loop
1239    sub         src, src, #2
1240
1241|mbvskip_filter|
1242    sub         src, src, #4
1243    subs        count, count, #1
1244
1245    ldrne       r6, [src], pstep            ; load source data
1246    ldrne       r7, [src], pstep
1247    ldrne       r8, [src], pstep
1248    ldrne       lr, [src], pstep
1249
1250    bne         MBVnext8
1251
1252    add         sp, sp, #16
1253
1254    ldmia       sp!, {r4 - r11, pc}
1255    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|
1256
; Constant Pool -- per-byte constants loaded PC-relative via ldr rX, c0x...
c0x80808080 DCD     0x80808080          ; sign-bit toggle: unsigned <-> signed bytes
c0x03030303 DCD     0x03030303          ; +3 per byte (Filter2 rounding)
c0x04040404 DCD     0x04040404          ; +4 per byte (Filter1 rounding)
c0x01010101 DCD     0x01010101          ; +1 per byte (outer-tap rounding)
c0x7F7F7F7F DCD     0x7F7F7F7F          ; clears each byte's msb after lsr #1
1263
1264    END
1265