;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; Symbols exported from this file, followed by the code section declaration.
; (Directives must stay indented: armasm treats column-0 text as a label.)
    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_vertical_edge_armv6|
    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY  ; name this block of code

;-----------------------------------------------------------------------
; TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
;
; Transposes a 4x4 matrix of bytes held in four 32-bit registers.
;
; In:    $a0..$a3  one matrix row per register, byte 0 in bits 7:0
; Out:   $b0..$b3  one matrix column per register
;                  ($b0 = bytes 0 of a3:a2:a1:a0, ... $b3 = bytes 3)
; Clobb: $a0..$a3 are overwritten with intermediate values.
;
; The outputs are written while inputs are still being read, and $b1/$b3
; are reused as temporaries before receiving their final value, so the
; $a and $b registers must all be distinct and the instruction order
; below must be preserved.
;-----------------------------------------------------------------------
    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    ; Even bytes (columns 0 and 2) of each row, interleaved pairwise.
    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    ; Odd bytes (columns 1 and 3) of each row, interleaved pairwise.
    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    ; Combine halfwords; $b2/$b0 must be formed before $b3/$b1 are replaced.
    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND


; Register aliases used by the routines in this file.
; The names must begin in column 0 so that armasm sees them as symbols.
src         RN  r0                          ; pixel pointer (src_ptr argument)
pstep       RN  r1                          ; src_pixel_step argument
count       RN  r5                          ; loop counter, loaded from the stack

; Arguments on entry to the routines that follow:
;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *blimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int  count

;-----------------------------------------------------------------------
; vp8_loop_filter_horizontal_edge_armv6
;
; Runs the vp8_filter_mask / vp8_hevmask / vp8_filter steps across a
; horizontal edge, four pixel columns (one 32-bit word per row) per
; loop pass, using the ARMv6 packed-byte instructions.
;
; In:    r0        src_ptr, addressing the q0 row; p3..p0 are the four
;                  rows before it, q1..q3 the three rows after it
;        r1        src_pixel_step, added to src after every row access
;        r2        blimit pointer (one byte read, replicated to 4 lanes)
;        r3        limit pointer  (one byte read, replicated to 4 lanes)
;        [sp]      thresh pointer (one byte read, replicated to 4 lanes)
;        [sp, #4]  count; the loop executes 2*count passes
; Out:   rows p1, p0, q0, q1 are rewritten in place for every word in
;        which at least one lane passes the filter mask; lanes whose
;        mask is clear receive a zero adjustment.
; Clobb: r0, r2, r3, r12, flags (NZCV and GE); r4-r11 are saved/restored.
; Stack: 36 bytes of saved registers plus a 16-byte scratch area
;        ([sp] qs0, [sp,#4] ps0, [sp,#8] qs1, [sp,#12] ps1).
; Note:  the loop is tested at the bottom, so count is assumed >= 1.
;        c0x7F7F7F7F, c0x80808080, c0x01010101, c0x03030303 and
;        c0x04040404 are literal words defined elsewhere in this file.
;-----------------------------------------------------------------------
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}         ; 9 registers = 36 bytes

    sub         src, src, pstep, lsl #2     ; rewind 4 rows: src -> p3
    ldr         count, [sp, #40]            ; 2nd stack argument: count (8 pixels per unit)
    ldr         r6, [sp, #36]               ; 1st stack argument: thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ; Load the first three rows while splatting blimit/limit/thresh
    ; into all four byte lanes of r4/r2/r3.
    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel: twice as many passes
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|Hnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; |a-b| per byte is formed as uqsub8(a,b) | uqsub8(b,a); a following
    ; uqsub8 against the limit leaves a non-zero byte only where the
    ; difference exceeds it.  lr collects every such excess.
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit
    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2 (mask drops bits shifted in from the next lane)
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3
    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0
    orr         lr, lr, r10
    sub         src, src, pstep, lsl #2     ; 8 rows read, rewind 4: src -> q0

    ; 0 - lr sets GE only in lanes where lr == 0 (nothing exceeded a
    ; limit); sel then yields 0xFF in those lanes and 0x00 elsewhere.
    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         hskip_filter                 ; skip filtering

    sub         src, src, pstep, lsl #1     ; rewind 2 more rows (6 in total): src -> p1

    ;vp8_hevmask() function
    ;calculate high edge variance
    orr         r10, r6, r8                 ; calculate vp8_hevmask

    ldr         r7, [src], pstep            ; p1

    ; Same GE trick with the sel operands swapped: 0xFF where either
    ; abs (p1-p0) or abs (q1-q0) exceeded thresh.
    usub8       r10, r12, r10               ; use usub8 instead of ssub8
    sel         r6, r12, r11                ; obtain vp8_hevmask: r6

    ;vp8_filter() function
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(qs0 - ps0)

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev

    ; Three saturating adds build vp8_filter + 3 * (qs0 - ps0); the
    ; constant loads are interleaved between them.
    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    and         r7, r7, lr                  ; vp8_filter &= mask;

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    ; shadd8 with zero is a per-byte arithmetic shift right by one;
    ; three of them give the >> 3.
    mov         r9, #0
    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
    shadd8      r8 , r8 , r9
    shadd8      r7 , r7 , r9
    shadd8      lr , r8 , r9                ; lr: Filter2
    shadd8      r7 , r7 , r9                ; r7: filter

    ; Disabled code kept from the earlier (pre-vp8) variant:
    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
    ;sel        lr, r11, r9
    ;usub8      r8, r10, r8
    ;sel        r8, r11, r9
    ;and        r8, r8, lr                  ; -1 for each element that equals 4

    ;calculate output
    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8 ,r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)

    ;end of modification for vp8

    mov         lr, #0
    sadd8       r7, r7 , r10                ; vp8_filter += 1
    shadd8      r7, r7, lr                  ; vp8_filter >>= 1

    ldr         r11, [sp, #12]              ; load ps1
    ldr         r10, [sp, #8]               ; load qs1

    bic         r7, r7, r6                  ; vp8_filter &= ~hev
    sub         src, src, pstep, lsl #2     ; 4 rows read, rewind 4: src -> p1

    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    qsub8       r10, r10,r7                 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)

    eor         r11, r11, r12               ; *op1 = u^0x80
    str         r11, [src], pstep           ; store op1
    eor         r9, r9, r12                 ; *op0 = u^0x80
    str         r9, [src], pstep            ; store op0 result
    eor         r8, r8, r12                 ; *oq0 = u^0x80
    str         r8, [src], pstep            ; store oq0 result
    eor         r10, r10, r12               ; *oq1 = u^0x80
    str         r10, [src], pstep           ; store oq1

    sub         src, src, pstep, lsl #1     ; 4 rows written, rewind 2: src -> q0

|hskip_filter|
    ; Both paths arrive with src on the q0 row of the current word.
    add         src, src, #4                ; next group of 4 pixel columns
    sub         src, src, pstep, lsl #2     ; src -> p3 of the next group

    subs        count, count, #1

    ; The loads do not touch the flags, so NE from subs still guards bne.
    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         Hnext8

    add         sp, sp, #16                 ; release temp buffer
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|


268233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp8_mbloop_filter_horizontal_edge_armv6| PROC
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stmdb       sp!, {r4 - r11, lr}
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         count, [sp, #40]            ; count for 8-in-parallel
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [sp, #36]               ; load thresh address
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         sp, sp, #16                 ; create temp buffer
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [src], pstep            ; p3
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r4, [r2]                    ; blimit
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, [src], pstep           ; p2
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r2, [r3]                    ; limit
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [src], pstep           ; p1
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r4, r4, r4, lsl #8
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r3, [r6]                    ; thresh
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r2, r2, r2, lsl #8
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         count, count, lsl #1        ; 4-in-parallel
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r4, r4, r4, lsl #16
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r3, r3, r3, lsl #8
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r2, r2, r2, lsl #16
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r3, r3, r3, lsl #16
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan|MBHnext8|
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; vp8_filter_mask() function
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; calculate breakout conditions
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [src], pstep           ; p0
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r9, r10                 ; p3 - p2
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r10, r9                 ; p2 - p3
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r10, r11                ; p2 - p1
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r11, r10               ; p1 - p2
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p3-p2)
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r8, r8, r10                 ; abs (p2-p1)
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r8, r2                  ; compare to limit
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r11, r12                ; p1 - p0
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r8
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r12, r11                ; p0 - p1
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [src], pstep            ; q0
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, [src], pstep           ; q1
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p1-p0)
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r6, r2                  ; compare to limit
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r6, r3                  ; compare to thresh  -- save r8 for later
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r7
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r11, r10                ; p1 - q1
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r10, r11                ; q1 - p1
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r12, r9                ; p0 - q0
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r12, r9, r12                ; q0 - p0
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p1-q1)
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, c0x7F7F7F7F
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r12, r11, r12               ; abs (p0-q0)
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [src], pstep           ; q2
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r9, r10                 ; q0 - q1
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r10, r9                 ; q1 - q0
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r12, r12, r4                ; compare to flimit
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r9, r11, r10                ; q2 - q1
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r12
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [src], pstep           ; q3
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r10, r11               ; q1 - q2
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r7, r6                  ; abs (q1-q0)
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r9, r10                ; abs (q2-q1)
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r6, r2                  ; compare to limit
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r10, r2                ; compare to limit
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r7
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r10
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r12, r11               ; q3 - q2
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r9, r11, r12                ; q2 - q3
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mvn         r11, #0                     ; r11 == -1
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r10, r9                ; abs (q3-q2)
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r10, r2                ; compare to limit
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r12, #0
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r10
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sel         lr, r11, r12                ; filter mask: lr
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         lr, #0
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         mbhskip_filter               ; skip filtering
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;vp8_hevmask() function
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;calculate high edge variance
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #1
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r8
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, [src], pstep            ; p1
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan    usub8       r10, r12, r10
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sel         r6, r12, r11                ; hev mask: r6
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;vp8_mbfilter() function
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;p2, q2 are only needed at the end. Don't need to load them in now.
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r8, [src], pstep            ; p0
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, c0x80808080
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [src], pstep            ; q0
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, [src]                  ; q1
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r7, r7, r12                 ; ps1
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, r12                 ; ps0
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r9, r9, r12                 ; qs0
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, r12               ; qs1
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r7, [sp, #12]               ; store ps1 temporarily
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r10, [sp, #8]               ; store qs1 temporarily
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r12
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r9, [sp]                    ; store qs0 temporarily
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r12
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r8, [sp, #4]                ; store ps0 temporarily
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r12                 ; vp8_filter: r7
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, c0x04040404
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r12, r7                     ; Filter2: r12
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r12, r12, r6                ; Filter2 &= hev
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;modify code for vp8
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;save bottom 3 bits so that we round one side +4 and the other +3
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, #0
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r10
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r12 , r12 , r10
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r10               ; r8: Filter1
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r12 , r12 , r10             ; r12: Filter2
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp]                    ; load qs0
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #4]               ; load ps0
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;save bottom 3 bits so that we round one side +4 and the other +3
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;mov            r10, #0
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;sel            lr, r11, r10
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;shadd8     r12 , r12 , r10
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;usub8      r8, r9, r8
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;sel            r8, r11, r10
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;ldr            r9, [sp]                    ; load qs0
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;ldr            r11, [sp, #4]               ; load ps0
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;shadd8     r12 , r12 , r10
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;and            r8, r8, lr                  ; -1 for each element that equals 4
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;end of modification for vp8
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bic         r12, r7, r6                 ; vp8_filter &= ~hev    ( r6 is free)
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;mov        r12, r7
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;roughly 3/7th difference across boundary
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0x1b                   ; 27
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0x3f                   ; 63
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r6, r12
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r10, r12, ror #8
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r8, r6, lr, r7
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r6, r6, lr, r7
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r7, r10, lr, r7
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smultb      r10, r10, lr
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r8, #8, r8, asr #7
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r6, #8, r6, asr #7
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r10, r10, #63
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r7, #8, r7, asr #7
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r10, #8, r10, asr #7
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, c0x80808080
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r8, r6, lsl #16
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r10, r7, r10, lsl #16
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r6, r6
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r10, r10
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, lr                  ; *oq0 = s^0x80
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r8, [src]                   ; store *oq0
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, lr                ; *op0 = s^0x80
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r10, [src]                  ; store *op0
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;roughly 2/7th difference across boundary
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0x12                   ; 18
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0x3f                   ; 63
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r6, r12
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r10, r12, ror #8
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r8, r6, lr, r7
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r6, r6, lr, r7
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r9, r10, lr, r7
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r10, r10, lr, r7
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r8, #8, r8, asr #7
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r6, #8, r6, asr #7
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r9, #8, r9, asr #7
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r10, #8, r10, asr #7
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, c0x80808080
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r8, r6, lsl #16
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r10, r9, r10, lsl #16
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp, #8]                ; load qs1
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #12]              ; load ps1
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r6, r6
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r10, r10
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r11, r11, lr                ; *op1 = s^0x80
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r11, [src], pstep           ; store *op1
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, lr                  ; *oq1 = s^0x80
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         src, src, pstep, lsl #1
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0x3f                   ; 63
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r8, [src], pstep            ; store *oq1
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;roughly 1/7th difference across boundary
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0x9                    ; 9
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [src]                   ; load q2
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r6, r12
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r10, r12, ror #8
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r8, r6, lr, r7
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r6, r6, lr, r7
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r12, r10, lr, r7
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r10, r10, lr, r7
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r8, #8, r8, asr #7
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r6, #8, r6, asr #7
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r12, #8, r12, asr #7
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r10, #8, r10, asr #7
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r8, r6, lsl #16
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r10, r12, r10, lsl #16
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, c0x80808080
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [src]                  ; load p2
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r6, r6
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r10, r10
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r9, r9, lr
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r11, r11, lr
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, lr                  ; *op2 = s^0x80
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r8, [src], pstep, lsl #2    ; store *op2
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         src, src, pstep
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, lr                ; *oq2 = s^0x80
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r10, [src], pstep, lsl #1   ; store *oq2
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan|mbhskip_filter|
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         src, src, #4
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #3
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        count, count, #1
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r9, [src], pstep            ; p3
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r10, [src], pstep           ; p2
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r11, [src], pstep           ; p1
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         MBHnext8
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         sp, sp, #16
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldmia       sp!, {r4 - r11, pc}
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp8_loop_filter_vertical_edge_armv6| PROC
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stmdb       sp!, {r4 - r11, lr}
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, #4                ; move src pointer down by 4
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         count, [sp, #40]            ; count for 8-in-parallel
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #36]              ; load thresh address
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         sp, sp, #16                 ; create temp buffer
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [src], pstep            ; load source data
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r4, [r2]                    ; blimit
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, [src], pstep
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r2, [r3]                    ; limit
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r8, [src], pstep
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r4, r4, r4, lsl #8
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r3, [r12]                   ; thresh
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r2, r2, r2, lsl #8
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [src], pstep
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         count, count, lsl #1        ; 4-in-parallel
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r4, r4, r4, lsl #16
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r3, r3, r3, lsl #8
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r2, r2, r2, lsl #16
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r3, r3, r3, lsl #16
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan|Vnext8|
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; vp8_filter_mask() function
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; calculate breakout conditions
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the source data for 4-in-parallel operation
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r9, r10                 ; p3 - p2
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r10, r9                 ; p2 - p3
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r9, r10, r11                ; p2 - p1
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r11, r10               ; p1 - p2
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r7, r7, r8                  ; abs (p3-p2)
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r9, r10                ; abs (p2-p1)
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r10, r2                ; compare to limit
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r10
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan
626233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r11, r12                ; p1 - p0
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r12, r11                ; p0 - p1
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         src, src, #4                ; move src pointer up by 4
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p1-p0)
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r11, [sp, #12]              ; save p1
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r6, r2                 ; compare to limit
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r6, r3                 ; compare to thresh
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r10
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the source data for 4-in-parallel operation
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [src], pstep            ; load source data
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r11, [sp]                   ; push r11 to stack
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, [src], pstep
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r8, [src], pstep
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         lr, [sp, #8]
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [src], pstep
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r12, r11                ; q3 - q2
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r11, r12                ; q2 - q3
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r12, r11, r10               ; q2 - q1
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r10, r11               ; q1 - q2
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (q3-q2)
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r7, r12, r11                ; abs (q2-q1)
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r6, r2                  ; compare to limit
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r7, r2                  ; compare to limit
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #4]               ; load back p0
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #12]              ; load back p1
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r6
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r7
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r11, r9                 ; p0 - q0
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r9, r11                 ; q0 - p0
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r12, r10                ; p1 - q1
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r10, r12               ; q1 - p1
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p0-q0)
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, c0x7F7F7F7F
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r8, r8, r11                 ; abs (p1-q1)
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r10, r9                ; q1 - q0
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r12, r9, r10                ; q0 - q1
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r6, r4                  ; compare to flimit
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r11, r12                ; abs (q1-q0)
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r9, r2                  ; compare to limit
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r9, r3                 ; compare to thresh
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r6
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r8
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mvn         r11, #0                     ; r11 == -1
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r12, #0
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan    usub8       lr, r12, lr
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp]                    ; load the compared result
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sel         lr, r11, r12                ; filter mask: lr
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         lr, #0
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         vskip_filter                 ; skip filtering
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;vp8_hevmask() function
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;calculate high edge variance
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r9, r10
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan
699233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r7, [src, #-2]
700233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r8, [src], pstep
701233d2500723e5594f3e7c70896ffeeef32b9c950ywan
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan    usub8       r9, r12, r9
703233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sel         r6, r12, r11                ; hev mask: r6
704233d2500723e5594f3e7c70896ffeeef32b9c950ywan
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;vp8_filter() function
706233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; load source data to r6, r11, r12, lr
707233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r9, [src, #-2]
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r10, [src], pstep
709233d2500723e5594f3e7c70896ffeeef32b9c950ywan
710233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r12, r7, r8, lsl #16
711233d2500723e5594f3e7c70896ffeeef32b9c950ywan
712233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r7, [src, #-2]
713233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r8, [src], pstep
714233d2500723e5594f3e7c70896ffeeef32b9c950ywan
715233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r11, r9, r10, lsl #16
716233d2500723e5594f3e7c70896ffeeef32b9c950ywan
717233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r9, [src, #-2]
718233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r10, [src], pstep
719233d2500723e5594f3e7c70896ffeeef32b9c950ywan
720233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
721233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r6, [sp]
722233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         lr, [sp, #4]
723233d2500723e5594f3e7c70896ffeeef32b9c950ywan
724233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r7, r8, lsl #16
725233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       lr, r9, r10, lsl #16
726233d2500723e5594f3e7c70896ffeeef32b9c950ywan
727233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
728233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
729233d2500723e5594f3e7c70896ffeeef32b9c950ywan
730233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;load back hev_mask r6 and filter_mask lr
731233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, c0x80808080
732233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [sp]
733233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [sp, #4]
734233d2500723e5594f3e7c70896ffeeef32b9c950ywan
735233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
736233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
737233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
738233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, r12               ; q1 offset to convert to a signed value
739233d2500723e5594f3e7c70896ffeeef32b9c950ywan
740233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r9, [sp]                    ; store qs0 temporarily
741233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r8, [sp, #4]                ; store ps0 temporarily
742233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r10, [sp, #8]               ; store qs1 temporarily
743233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r7, [sp, #12]               ; store ps1 temporarily
744233d2500723e5594f3e7c70896ffeeef32b9c950ywan
745233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
746233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
747233d2500723e5594f3e7c70896ffeeef32b9c950ywan
748233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r7, r7, r6                  ;  vp8_filter (r7) &= hev (r7 : filter)
749233d2500723e5594f3e7c70896ffeeef32b9c950ywan
750233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r8
751233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8
752233d2500723e5594f3e7c70896ffeeef32b9c950ywan
753233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r8
754233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, c0x04040404
755233d2500723e5594f3e7c70896ffeeef32b9c950ywan
756233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r8
757233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;mvn         r11, #0                     ; r11 == -1
758233d2500723e5594f3e7c70896ffeeef32b9c950ywan
759233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r7, r7, lr                  ; vp8_filter &= mask
760233d2500723e5594f3e7c70896ffeeef32b9c950ywan
761233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
762233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r8 , r7 , r9                ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
763233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7 , r7 , r10               ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
764233d2500723e5594f3e7c70896ffeeef32b9c950ywan
765233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r9, #0
766233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r9                ; Filter2 >>= 3
767233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r7 , r7 , r9                ; vp8_filter >>= 3
768233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r9
769233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r7 , r7 , r9
770233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      lr , r8 , r9                ; lr: filter2
771233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r7 , r7 , r9                ; r7: filter
772233d2500723e5594f3e7c70896ffeeef32b9c950ywan
773233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;usub8      lr, r8, r10                 ; s = (s==4)*-1
774233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;sel            lr, r11, r9
775233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;usub8      r8, r10, r8
776233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;sel            r8, r11, r9
777233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;and            r8, r8, lr                  ; -1 for each element that equals 4 -- r8: s
778233d2500723e5594f3e7c70896ffeeef32b9c950ywan
779233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;calculate output
780233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      lr, r8, r7                  ; u = vp8_signed_char_clamp(s + vp8_filter)
781233d2500723e5594f3e7c70896ffeeef32b9c950ywan
782233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r8, [sp]                    ; load qs0
783233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp, #4]                ; load ps0
784233d2500723e5594f3e7c70896ffeeef32b9c950ywan
785233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, c0x01010101
786233d2500723e5594f3e7c70896ffeeef32b9c950ywan
787233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
788233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
789233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;end of modification for vp8
790233d2500723e5594f3e7c70896ffeeef32b9c950ywan
791233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, r12
792233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r9, r9, r12
793233d2500723e5594f3e7c70896ffeeef32b9c950ywan
794233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0
795233d2500723e5594f3e7c70896ffeeef32b9c950ywan
796233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sadd8       r7, r7, r10
797233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r7, r7, lr
798233d2500723e5594f3e7c70896ffeeef32b9c950ywan
799233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, [sp, #8]               ; load qs1
800233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #12]              ; load ps1
801233d2500723e5594f3e7c70896ffeeef32b9c950ywan
802233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bic         r7, r7, r6                  ; r7: vp8_filter
803233d2500723e5594f3e7c70896ffeeef32b9c950ywan
804233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r10 , r10, r7               ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
805233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
806233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, r12
807233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r11, r11, r12
808233d2500723e5594f3e7c70896ffeeef32b9c950ywan
809233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2
810233d2500723e5594f3e7c70896ffeeef32b9c950ywan
811233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
812233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;output is b0, b1, b2, b3
813233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;b0: 03 02 01 00
814233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;b1: 13 12 11 10
815233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;b2: 23 22 21 20
816233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;b3: 33 32 31 30
817233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;    p1 p0 q0 q1
818233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;   (a3 a2 a1 a0)
819233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
820233d2500723e5594f3e7c70896ffeeef32b9c950ywan
821233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        r6, [src, #-2]              ; store the result
822233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r6, r6, lsr #16
823233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        r6, [src], pstep
824233d2500723e5594f3e7c70896ffeeef32b9c950ywan
825233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        r7, [src, #-2]
826233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, r7, lsr #16
827233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        r7, [src], pstep
828233d2500723e5594f3e7c70896ffeeef32b9c950ywan
829233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        r12, [src, #-2]
830233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r12, r12, lsr #16
831233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        r12, [src], pstep
832233d2500723e5594f3e7c70896ffeeef32b9c950ywan
833233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        lr, [src, #-2]
834233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, lr, lsr #16
835233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strh        lr, [src], pstep
836233d2500723e5594f3e7c70896ffeeef32b9c950ywan
837233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vskip_filter|
838233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, #4
839233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        count, count, #1
840233d2500723e5594f3e7c70896ffeeef32b9c950ywan
841233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r6, [src], pstep            ; load source data
842233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r7, [src], pstep
843233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r8, [src], pstep
844233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       lr, [src], pstep
845233d2500723e5594f3e7c70896ffeeef32b9c950ywan
846233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         Vnext8
847233d2500723e5594f3e7c70896ffeeef32b9c950ywan
848233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         sp, sp, #16
849233d2500723e5594f3e7c70896ffeeef32b9c950ywan
850233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldmia       sp!, {r4 - r11, pc}
851233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|
852233d2500723e5594f3e7c70896ffeeef32b9c950ywan
853233d2500723e5594f3e7c70896ffeeef32b9c950ywan
854233d2500723e5594f3e7c70896ffeeef32b9c950ywan
855233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
856233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp8_mbloop_filter_vertical_edge_armv6| PROC
857233d2500723e5594f3e7c70896ffeeef32b9c950ywan;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
858233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stmdb       sp!, {r4 - r11, lr}
859233d2500723e5594f3e7c70896ffeeef32b9c950ywan
860233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, #4                ; move src pointer down by 4
861233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         count, [sp, #40]            ; count for 8-in-parallel
862233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #36]              ; load thresh address
863233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]                  ; preload for next block
864233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         sp, sp, #16                 ; create temp buffer
865233d2500723e5594f3e7c70896ffeeef32b9c950ywan
866233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [src], pstep            ; load source data
867233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r4, [r2]                    ; blimit
868233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]
869233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, [src], pstep
870233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r2, [r3]                    ; limit
871233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]
872233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r8, [src], pstep
873233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r4, r4, r4, lsl #8
874233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r3, [r12]                   ; thresh
875233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r2, r2, r2, lsl #8
876233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]
877233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [src], pstep
878233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         count, count, lsl #1        ; 4-in-parallel
879233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r4, r4, r4, lsl #16
880233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r3, r3, r3, lsl #8
881233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r2, r2, r2, lsl #16
882233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r3, r3, r3, lsl #16
883233d2500723e5594f3e7c70896ffeeef32b9c950ywan
884233d2500723e5594f3e7c70896ffeeef32b9c950ywan|MBVnext8|
885233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; vp8_filter_mask() function
886233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; calculate breakout conditions
887233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the source data for 4-in-parallel operation
888233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
889233d2500723e5594f3e7c70896ffeeef32b9c950ywan
890233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r9, r10                 ; p3 - p2
891233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r10, r9                 ; p2 - p3
892233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r9, r10, r11                ; p2 - p1
893233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r11, r10               ; p1 - p2
894233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r7, r7, r8                  ; abs (p3-p2)
895233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r9, r10                ; abs (p2-p1)
896233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
897233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r10, r2                ; compare to limit
898233d2500723e5594f3e7c70896ffeeef32b9c950ywan
899233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
900233d2500723e5594f3e7c70896ffeeef32b9c950ywan
901233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r10
902233d2500723e5594f3e7c70896ffeeef32b9c950ywan
903233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r11, r12                ; p1 - p0
904233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r12, r11                ; p0 - p1
905233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         src, src, #4                ; move src pointer up by 4
906233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p1-p0)
907233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r11, [sp, #12]              ; save p1
908233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r6, r2                 ; compare to limit
909233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r6, r3                 ; compare to thresh
910233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r10
911233d2500723e5594f3e7c70896ffeeef32b9c950ywan
912233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
913233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; transpose the source data for 4-in-parallel operation
914233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [src], pstep            ; load source data
915233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r11, [sp]                   ; push r11 to stack
916233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, [src], pstep
917233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
918233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r8, [src], pstep
919233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         lr, [sp, #8]
920233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [src], pstep
921233d2500723e5594f3e7c70896ffeeef32b9c950ywan
922233d2500723e5594f3e7c70896ffeeef32b9c950ywan
923233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
924233d2500723e5594f3e7c70896ffeeef32b9c950ywan
925233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [sp, #8]                ; load back (f)limit accumulator
926233d2500723e5594f3e7c70896ffeeef32b9c950ywan
927233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r12, r11                ; q3 - q2
928233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r11, r12                ; q2 - q3
929233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r12, r11, r10               ; q2 - q1
930233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r10, r11               ; q1 - q2
931233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (q3-q2)
932233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r7, r12, r11                ; abs (q2-q1)
933233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r6, r2                  ; compare to limit
934233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r7, r2                  ; compare to limit
935233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #4]               ; load back p0
936233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #12]              ; load back p1
937233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r6
938233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r7
939233d2500723e5594f3e7c70896ffeeef32b9c950ywan
940233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r11, r9                 ; p0 - q0
941233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r7, r9, r11                 ; q0 - p0
942233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r12, r10                ; p1 - q1
943233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r10, r12               ; q1 - p1
944233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r6, r6, r7                  ; abs (p0-q0)
945233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r7, c0x7F7F7F7F
946233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r8, r8, r11                 ; abs (p1-q1)
947233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
948233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
949233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r11, r10, r9                ; q1 - q0
950233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
951233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r12, r9, r10                ; q0 - q1
952233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r6, r6, r4                  ; compare to flimit
953233d2500723e5594f3e7c70896ffeeef32b9c950ywan
954233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r11, r12                ; abs (q1-q0)
955233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r8, r9, r2                  ; compare to limit
956233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uqsub8      r10, r9, r3                 ; compare to thresh
957233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r6
958233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         lr, lr, r8
959233d2500723e5594f3e7c70896ffeeef32b9c950ywan
960233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mvn         r11, #0                     ; r11 == -1
961233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r12, #0
962233d2500723e5594f3e7c70896ffeeef32b9c950ywan
963233d2500723e5594f3e7c70896ffeeef32b9c950ywan    usub8       lr, r12, lr
964233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp]                    ; load the compared result
965233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sel         lr, r11, r12                ; filter mask: lr
966233d2500723e5594f3e7c70896ffeeef32b9c950ywan
967233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         lr, #0
968233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         mbvskip_filter               ; skip filtering
969233d2500723e5594f3e7c70896ffeeef32b9c950ywan
970233d2500723e5594f3e7c70896ffeeef32b9c950ywan
971233d2500723e5594f3e7c70896ffeeef32b9c950ywan
972233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;vp8_hevmask() function
973233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;calculate high edge variance
974233d2500723e5594f3e7c70896ffeeef32b9c950ywan
975233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
976233d2500723e5594f3e7c70896ffeeef32b9c950ywan
977233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r9, r10
978233d2500723e5594f3e7c70896ffeeef32b9c950ywan
979233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r7, [src, #-2]
980233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r8, [src], pstep
981233d2500723e5594f3e7c70896ffeeef32b9c950ywan
982233d2500723e5594f3e7c70896ffeeef32b9c950ywan    usub8       r9, r12, r9
983233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sel         r6, r12, r11                ; hev mask: r6
984233d2500723e5594f3e7c70896ffeeef32b9c950ywan
985233d2500723e5594f3e7c70896ffeeef32b9c950ywan
986233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; vp8_mbfilter() function
987233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; p2, q2 are only needed at the end. Don't need to load them in now.
988233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
989233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; load source data to r12, r11, r6, lr
990233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r9, [src, #-2]
991233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r10, [src], pstep
992233d2500723e5594f3e7c70896ffeeef32b9c950ywan
993233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r12, r7, r8, lsl #16
994233d2500723e5594f3e7c70896ffeeef32b9c950ywan
995233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r7, [src, #-2]
996233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r8, [src], pstep
997233d2500723e5594f3e7c70896ffeeef32b9c950ywan
998233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r11, r9, r10, lsl #16
999233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1000233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r9, [src, #-2]
1001233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrh        r10, [src], pstep
1002233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1003233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r6, [sp]                    ; save r6
1004233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         lr, [sp, #4]                ; save lr
1005233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1006233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r7, r8, lsl #16
1007233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       lr, r9, r10, lsl #16
1008233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1009233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
1010233d2500723e5594f3e7c70896ffeeef32b9c950ywan    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
1011233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1012233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;load back hev_mask r6 and filter_mask lr
1013233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, c0x80808080
1014233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r6, [sp]
1015233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, [sp, #4]
1016233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1017233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r7, r7, r12                 ; ps1
1018233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, r12                 ; ps0
1019233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r9, r9, r12                 ; qs0
1020233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, r12               ; qs1
1021233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1022233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
1023233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r7, [sp, #12]               ; store ps1 temporarily
1024233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
1025233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r10, [sp, #8]               ; store qs1 temporarily
1026233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r12
1027233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r9, [sp]                    ; store qs0 temporarily
1028233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r12
1029233d2500723e5594f3e7c70896ffeeef32b9c950ywan    str         r8, [sp, #4]                ; store ps0 temporarily
1030233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r7, r7, r12                 ; vp8_filter: r7
1031233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1032233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
1033233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, c0x04040404
1034233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;mvn         r11, #0                     ; r11 == -1
1035233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1036233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)
1037233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1038233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r12, r7                     ; Filter2: r12
1039233d2500723e5594f3e7c70896ffeeef32b9c950ywan    and         r12, r12, r6                ; Filter2 &= hev
1040233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1041233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;modify code for vp8
1042233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;save bottom 3 bits so that we round one side +4 and the other +3
1043233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r8 , r12 , r9               ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
1044233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r12 , r12 , r10             ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
1045233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1046233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, #0
1047233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r10               ; Filter1 >>= 3
1048233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r12 , r12 , r10             ; Filter2 >>= 3
1049233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r10
1050233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r12 , r12 , r10
1051233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r8 , r8 , r10               ; r8: Filter1
1052233d2500723e5594f3e7c70896ffeeef32b9c950ywan    shadd8      r12 , r12 , r10             ; r12: Filter2
1053233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1054233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp]                    ; load qs0
1055233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #4]               ; load ps0
1056233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1057233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r9 , r9, r8                 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
1058233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
1059233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1060233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;save bottom 3 bits so that we round one side +4 and the other +3
1061233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;and            r8, r12, r10                ; s = Filter2 & 7 (s: r8)
1062233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      r12 , r12 , r9              ; Filter2 = vp8_signed_char_clamp(Filter2+4)
1063233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;mov            r10, #0
1064233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;shadd8     r12 , r12 , r10             ; Filter2 >>= 3
1065233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;usub8      lr, r8, r9                  ; s = (s==4)*-1
1066233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;sel            lr, r11, r10
1067233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;shadd8     r12 , r12 , r10
1068233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;usub8      r8, r9, r8
1069233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;sel            r8, r11, r10
1070233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;ldr            r9, [sp]                    ; load qs0
1071233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;ldr            r11, [sp, #4]               ; load ps0
1072233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;shadd8     r12 , r12 , r10
1073233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;and            r8, r8, lr                  ; -1 for each element that equals 4
1074233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      r10, r8, r12                ; u = vp8_signed_char_clamp(s + Filter2)
1075233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qsub8      r9 , r9, r12                ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
1076233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;qadd8      r11, r11, r10               ; ps0 = vp8_signed_char_clamp(ps0 + u)
1077233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1078233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;end of modification for vp8
1079233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1080233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bic         r12, r7, r6                 ;vp8_filter &= ~hev    ( r6 is free)
1081233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;mov            r12, r7
1082233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1083233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;roughly 3/7th difference across boundary
1084233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0x1b                   ; 27
1085233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0x3f                   ; 63
1086233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1087233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r6, r12
1088233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r10, r12, ror #8
1089233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r8, r6, lr, r7
1090233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r6, r6, lr, r7
1091233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r7, r10, lr, r7
1092233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smultb      r10, r10, lr
1093233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r8, #8, r8, asr #7
1094233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r6, #8, r6, asr #7
1095233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r10, r10, #63
1096233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r7, #8, r7, asr #7
1097233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r10, #8, r10, asr #7
1098233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1099233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, c0x80808080
1100233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1101233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r8, r6, lsl #16
1102233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r10, r7, r10, lsl #16
1103233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r6, r6
1104233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r10, r10
1105233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1106233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
1107233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1108233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
1109233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1110233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
1111233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
1112233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, lr                  ; *oq0 = s^0x80
1113233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, lr                ; *op0 = s^0x80
1114233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1115233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-1]             ; store op0 result
1116233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src], pstep            ; store oq0 result
1117233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1118233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1119233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-1]
1120233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src], pstep
1121233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1122233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1123233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-1]
1124233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src], pstep
1125233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1126233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1127233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-1]
1128233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src], pstep
1129233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1130233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;roughly 2/7th difference across boundary
1131233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0x12                   ; 18
1132233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0x3f                   ; 63
1133233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1134233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r6, r12
1135233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r10, r12, ror #8
1136233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r8, r6, lr, r7
1137233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r6, r6, lr, r7
1138233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r9, r10, lr, r7
1139233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1140233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r10, r10, lr, r7
1141233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r8, #8, r8, asr #7
1142233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r6, #8, r6, asr #7
1143233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r9, #8, r9, asr #7
1144233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r10, #8, r10, asr #7
1145233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1146233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
1147233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1148233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r8, r6, lsl #16
1149233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r10, r9, r10, lsl #16
1150233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1151233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r9, [sp, #8]                ; load qs1
1152233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r11, [sp, #12]              ; load ps1
1153233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, c0x80808080
1154233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1155233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r6, r6
1156233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r10, r10
1157233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1158233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         src, src, #2
1159233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1160233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
1161233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1162233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
1163233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
1164233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, lr                  ; *oq1 = s^0x80
1165233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, lr                ; *op1 = s^0x80
1166233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1167233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
1168233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-4]             ; store op1
1169233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-1]              ; store oq1
1170233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary
1171233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1172233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1173233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1174233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1175233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r6, [src, #-5]
1176233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-4]
1177233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-1]
1178233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r7, [src], pstep
1179233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1180233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1181233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1182233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r11, r11, r6, lsl #8
1183233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r9, r7, lsl #8
1184233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1185233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r6, [src, #-5]
1186233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-4]
1187233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-1]
1188233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r7, [src], pstep
1189233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1190233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1191233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1192233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r11, r11, r6, lsl #16
1193233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r9, r7, lsl #16
1194233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1195233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r6, [src, #-5]
1196233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src, #-4]
1197233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-1]
1198233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrb        r7, [src], pstep
1199233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r11, r11, r6, lsl #24
1200233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r9, r9, r7, lsl #24
1201233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1202233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;roughly 1/7th difference across boundary
1203233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r9, r9, lr
1204233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r11, r11, lr
1205233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1206233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         lr, #0x9                    ; 9
1207233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0x3f                   ; 63
1208233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1209233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r6, r12
1210233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sxtb16      r10, r12, ror #8
1211233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r8, r6, lr, r7
1212233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r6, r6, lr, r7
1213233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlabb      r12, r10, lr, r7
1214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    smlatb      r10, r10, lr, r7
1215233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r8, #8, r8, asr #7
1216233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r6, #8, r6, asr #7
1217233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r12, #8, r12, asr #7
1218233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ssat        r10, #8, r10, asr #7
1219233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1220233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, pstep, lsl #2
1221233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1222233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r6, r8, r6, lsl #16
1223233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pkhbt       r10, r12, r10, lsl #16
1224233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1225233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r6, r6
1226233d2500723e5594f3e7c70896ffeeef32b9c950ywan    uxtb16      r10, r10
1227233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1228233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         lr, c0x80808080
1229233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1230233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
1231233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1232233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
1233233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
1234233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r8, r8, lr                  ; *op2 = s^0x80
1235233d2500723e5594f3e7c70896ffeeef32b9c950ywan    eor         r10, r10, lr                ; *oq2 = s^0x80
1236233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1237233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-5]              ; store *op2
1238233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src], pstep           ; store *oq2
1239233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1240233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1241233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-5]
1242233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src], pstep
1243233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1244233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1245233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-5]
1246233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src], pstep
1247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r8, r8, lsr #8
1248233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r10, r10, lsr #8
1249233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r8, [src, #-5]
1250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    strb        r10, [src], pstep
1251233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;adjust src pointer for next loop
1253233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, #2
1254233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1255233d2500723e5594f3e7c70896ffeeef32b9c950ywan|mbvskip_filter|
1256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         src, src, #4
1257233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        count, count, #1
1258233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1259233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]                  ; preload for next block
1260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r6, [src], pstep            ; load source data
1261233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]
1262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r7, [src], pstep
1263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]
1264233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       r8, [src], pstep
1265233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pld         [src, #23]
1266233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldrne       lr, [src], pstep
1267233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1268233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         MBVnext8
1269233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1270233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         sp, sp, #16
1271233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1272233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldmia       sp!, {r4 - r11, pc}
1273233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|
1274233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1275233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Constant Pool: 32-bit literals (DCD), each holding one byte value replicated in all four byte positions
1276233d2500723e5594f3e7c70896ffeeef32b9c950ywanc0x80808080 DCD     0x80808080    ; loaded into lr and EORed with packed results for the "s^0x80" steps (flips the top bit of every byte)
1277233d2500723e5594f3e7c70896ffeeef32b9c950ywanc0x03030303 DCD     0x03030303    ; 0x03 in every byte; not referenced in the tail of the file - presumably used by earlier filter code (TODO confirm)
1278233d2500723e5594f3e7c70896ffeeef32b9c950ywanc0x04040404 DCD     0x04040404    ; 0x04 in every byte; not referenced in the tail of the file - presumably used by earlier filter code (TODO confirm)
1279233d2500723e5594f3e7c70896ffeeef32b9c950ywanc0x01010101 DCD     0x01010101    ; 0x01 in every byte; not referenced in the tail of the file - presumably used by earlier filter code (TODO confirm)
1280233d2500723e5594f3e7c70896ffeeef32b9c950ywanc0x7F7F7F7F DCD     0x7F7F7F7F    ; 0x7F in every byte; not referenced in the tail of the file - presumably used by earlier filter code (TODO confirm)
1281233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1282233d2500723e5594f3e7c70896ffeeef32b9c950ywan    END
1283