1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
13    EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
14
15    AREA    |.text|, CODE, READONLY  ; name this block of code
16
17    MACRO
18    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
19    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; Transposes a 4x4 matrix of bytes: each input register holds one
    ; row of 4 pixels, each output register receives one column.
    ; NOTE: $a0-$a3 are clobbered (reused as scratch for the odd lanes).
20    ; a0: 03 02 01 00
21    ; a1: 13 12 11 10
22    ; a2: 23 22 21 20
23    ; a3: 33 32 31 30
24    ;     b3 b2 b1 b0

    ; Gather the even-numbered bytes of each row, then merge row pairs.
26    uxtb16      $b1, $a1                    ; xx 12 xx 10
27    uxtb16      $b0, $a0                    ; xx 02 xx 00
28    uxtb16      $b3, $a3                    ; xx 32 xx 30
29    uxtb16      $b2, $a2                    ; xx 22 xx 20
30    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
31    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    ; Same for the odd-numbered bytes (rotate each row right by 8 first).
33    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
34    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
35    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
36    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
37    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
38    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    ; Repack halfwords so each destination register holds one column.
40    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
41    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
42
43    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
44    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
45    MEND
46
47
48
    ; Register aliases used by both filter procedures below.
49src         RN  r0                          ; current pixel pointer (advanced as we go)
50pstep       RN  r1                          ; pitch: bytes between successive rows

    ; Argument layout shared by both procedures:
52;r0     unsigned char *src_ptr,
53;r1     int src_pixel_step,
54;r2     const char *blimit
55
56;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
57|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
58;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    ; VP8 "simple" loop filter applied across a horizontal edge.
    ; In:  src (r0)   - points at the q0 row (first row below the edge)
    ;      pstep (r1) - pitch in bytes between rows
    ;      r2         - pointer to the blimit threshold byte
    ; Uses ARMv6 SIMD to filter 4 pixels per pass; 4 passes total.
    ; Only the p0 and q0 rows are modified. r4-r11 preserved on the stack.
59    stmdb       sp!, {r4 - r11, lr}

61    ldrb        r12, [r2]                   ; blimit
62    ldr         r3, [src, -pstep, lsl #1]   ; p1
63    ldr         r4, [src, -pstep]           ; p0
64    ldr         r5, [src]                   ; q0
65    ldr         r6, [src, pstep]            ; q1
66    orr         r12, r12, r12, lsl #8       ; blimit
67    ldr         r2, c0x80808080             ; sign-bit toggle: unsigned <-> signed pixels
68    orr         r12, r12, r12, lsl #16      ; blimit replicated into all 4 bytes
69    mov         r9, #4                      ; loop count: 4 passes of 4 pixels each
70    mov         lr, #0                      ; need 0 in a couple places

72|simple_hnext8|
73    ; vp8_simple_filter_mask()

75    uqsub8      r7, r3, r6                  ; p1 - q1
76    uqsub8      r8, r6, r3                  ; q1 - p1
77    uqsub8      r10, r4, r5                 ; p0 - q0
78    uqsub8      r11, r5, r4                 ; q0 - p0
79    orr         r8, r8, r7                  ; abs(p1 - q1)
80    orr         r10, r10, r11               ; abs(p0 - q0)
81    uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2
82    uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1
83    uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2
84    mvn         r8, #0
85    usub8       r10, r12, r10               ; compare to blimit. usub8 sets GE flags
86    sel         r10, r8, lr                 ; filter mask: F or 0
87    cmp         r10, #0
88    beq         simple_hskip_filter         ; skip filtering if all masks are 0x00

90    ;vp8_simple_filter()

92    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
93    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
94    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
95    eor         r5, r5, r2                  ; q0 offset to convert to a signed value

97    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
98    qsub8       r6, r5, r4                  ; q0 - p0
99    qadd8       r3, r3, r6                  ; += q0 - p0
100    ldr         r7, c0x04040404
101    qadd8       r3, r3, r6                  ; += q0 - p0
102    ldr         r8, c0x03030303
103    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
104    ;STALL
105    and         r3, r3, r10                 ; vp8_filter &= mask

107    qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4
108    qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3

    ; Per-byte arithmetic >>3: three shadd8-with-zero steps, since each
    ; signed halving add of 0 shifts every signed byte right by one.
110    shadd8      r7 , r7 , lr
111    shadd8      r8 , r8 , lr
112    shadd8      r7 , r7 , lr
113    shadd8      r8 , r8 , lr
114    shadd8      r7 , r7 , lr                ; Filter1 >>= 3
115    shadd8      r8 , r8 , lr                ; Filter2 >>= 3

117    qsub8       r5 ,r5, r7                  ; u = q0 - Filter1
118    qadd8       r4, r4, r8                  ; u = p0 + Filter2
119    eor         r5, r5, r2                  ; *oq0 = u^0x80
120    str         r5, [src]                   ; store oq0 result
121    eor         r4, r4, r2                  ; *op0 = u^0x80
122    str         r4, [src, -pstep]           ; store op0 result

124|simple_hskip_filter|
125    subs        r9, r9, #1
126    addne       src, src, #4                ; advance to the next 4 columns

    ; reload the four rows for the next group of pixels (unless finished)
128    ldrne       r3, [src, -pstep, lsl #1]   ; p1
129    ldrne       r4, [src, -pstep]           ; p0
130    ldrne       r5, [src]                   ; q0
131    ldrne       r6, [src, pstep]            ; q1

133    bne         simple_hnext8

135    ldmia       sp!, {r4 - r11, pc}
136    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
137
138
139;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
140|vp8_loop_filter_simple_vertical_edge_armv6| PROC
141;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    ; VP8 "simple" loop filter applied across a vertical edge.
    ; In:  src (r0)   - points at q0; the edge lies between src[-1] and
    ;                   src[0] on each row
    ;      pstep (r1) - pitch in bytes between rows
    ;      r2         - pointer to the blimit threshold byte
    ; Loads 2 bytes either side of the edge from 4 rows, transposes them
    ; into p1/p0/q0/q1 vectors, filters, then stores the p0/q0 bytes
    ; back one row at a time. 4 passes of 4 rows. r4-r11 preserved.
142    stmdb       sp!, {r4 - r11, lr}

144    ldrb        r12, [r2]                   ; r12: blimit
145    ldr         r2, c0x80808080             ; sign-bit toggle: unsigned <-> signed pixels
146    orr         r12, r12, r12, lsl #8

148    ; load source data to r7, r8, r9, r10
149    ldrh        r3, [src, #-2]
150    pld         [src, #23]                  ; preload for next block
151    ldrh        r4, [src], pstep
152    orr         r12, r12, r12, lsl #16      ; blimit replicated into all 4 bytes

154    ldrh        r5, [src, #-2]
155    pld         [src, #23]
156    ldrh        r6, [src], pstep

158    pkhbt       r7, r3, r4, lsl #16

160    ldrh        r3, [src, #-2]
161    pld         [src, #23]
162    ldrh        r4, [src], pstep

164    pkhbt       r8, r5, r6, lsl #16

166    ldrh        r5, [src, #-2]
167    pld         [src, #23]
168    ldrh        r6, [src], pstep
169    mov         r11, #4                     ; loop count: 4 passes of 4 rows each

171|simple_vnext8|
172    ; vp8_simple_filter_mask() function
173    pkhbt       r9, r3, r4, lsl #16
174    pkhbt       r10, r5, r6, lsl #16

176    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
177    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6

179    uqsub8      r7, r3, r6                  ; p1 - q1
180    uqsub8      r8, r6, r3                  ; q1 - p1
181    uqsub8      r9, r4, r5                  ; p0 - q0
182    uqsub8      r10, r5, r4                 ; q0 - p0
183    orr         r7, r7, r8                  ; abs(p1 - q1)
184    orr         r9, r9, r10                 ; abs(p0 - q0)
185    mov         r8, #0
186    uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2
187    uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2
188    uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2
189    mvn         r10, #0                     ; r10 == -1

191    usub8       r7, r12, r7                 ; compare to blimit. usub8 sets GE flags
192    sel         lr, r10, r8                 ; filter mask

194    cmp         lr, #0
195    beq         simple_vskip_filter         ; skip filtering

197    ;vp8_simple_filter() function
198    eor         r3, r3, r2                  ; p1 offset to convert to a signed value
199    eor         r6, r6, r2                  ; q1 offset to convert to a signed value
200    eor         r4, r4, r2                  ; p0 offset to convert to a signed value
201    eor         r5, r5, r2                  ; q0 offset to convert to a signed value

203    qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1
204    qsub8       r6, r5, r4                  ; q0 - p0

206    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
207    ldr         r9, c0x03030303             ; r9 = 3

209    qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0
210    ldr         r7, c0x04040404

212    qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0))
213    ;STALL
214    and         r3, r3, lr                  ; vp8_filter &= mask

216    qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3
217    qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4

    ; Per-byte arithmetic >>3: three shadd8-with-zero steps (r8 == 0).
219    shadd8      r9 , r9 , r8
220    shadd8      r3 , r3 , r8
221    shadd8      r9 , r9 , r8
222    shadd8      r3 , r3 , r8
223    shadd8      r9 , r9 , r8                ; Filter2 >>= 3
224    shadd8      r3 , r3 , r8                ; Filter1 >>= 3

226    ;calculate output
227    sub         src, src, pstep, lsl #2     ; rewind to the first of the 4 rows just read

229    qadd8       r4, r4, r9                  ; u = p0 + Filter2
230    qsub8       r5, r5, r3                  ; u = q0 - Filter1
231    eor         r4, r4, r2                  ; *op0 = u^0x80
232    eor         r5, r5, r2                  ; *oq0 = u^0x80

    ; Scatter the 4 filtered p0/q0 byte pairs back, one row at a time.
234    strb        r4, [src, #-1]              ; store the result
235    mov         r4, r4, lsr #8
236    strb        r5, [src], pstep
237    mov         r5, r5, lsr #8

239    strb        r4, [src, #-1]
240    mov         r4, r4, lsr #8
241    strb        r5, [src], pstep
242    mov         r5, r5, lsr #8

244    strb        r4, [src, #-1]
245    mov         r4, r4, lsr #8
246    strb        r5, [src], pstep
247    mov         r5, r5, lsr #8

249    strb        r4, [src, #-1]
250    strb        r5, [src], pstep

252|simple_vskip_filter|
253    subs        r11, r11, #1

255    ; load source data to r7, r8, r9, r10
256    ldrneh      r3, [src, #-2]
257    pld         [src, #23]                  ; preload for next block
258    ldrneh      r4, [src], pstep

260    ldrneh      r5, [src, #-2]
261    pld         [src, #23]
262    ldrneh      r6, [src], pstep

    ; (executes even on the final pass; its result is unused then)
264    pkhbt       r7, r3, r4, lsl #16

266    ldrneh      r3, [src, #-2]
267    pld         [src, #23]
268    ldrneh      r4, [src], pstep

270    pkhbt       r8, r5, r6, lsl #16

272    ldrneh      r5, [src, #-2]
273    pld         [src, #23]
274    ldrneh      r6, [src], pstep

276    bne         simple_vnext8

278    ldmia       sp!, {r4 - r11, pc}
279    ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
280
281; Constant Pool (PC-relative literals shared by both procedures)
282c0x80808080 DCD     0x80808080              ; per-byte sign-bit toggle (unsigned <-> signed)
283c0x03030303 DCD     0x03030303              ; per-byte +3 (Filter2 rounding term)
284c0x04040404 DCD     0x04040404              ; per-byte +4 (Filter1 rounding term)

286    END
287