loopfilter_4_neon.asm revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1;
2;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11    EXPORT  |vpx_lpf_horizontal_4_neon|    ; entry point defined below, made visible to the linker
12    EXPORT  |vpx_lpf_vertical_4_neon|      ; entry point defined below, made visible to the linker
13    ARM                                    ; the instructions that follow are ARM (not Thumb) code
14
15    AREA ||.text||, CODE, READONLY, ALIGN=2    ; read-only code section, aligned to 2^2 = 4 bytes
16
; void vpx_lpf_horizontal_4_neon(uint8_t *s,
;                                int p /* pitch */,
;                                const uint8_t *blimit,
;                                const uint8_t *limit,
;                                const uint8_t *thresh,
;                                int count)
;
; Filters across a horizontal edge. For each pass the four rows above s
; (p3..p0) and the four rows starting at s (q0..q3) are loaded, handed to
; vpx_loop_filter_neon, and rows p1, p0, q0, q1 are written back.
;
; One pass covers 8 adjacent pixels; `count` passes are made, moving s right
; by 8 bytes each time. (The vp8 loop filter works on 16 at a time.)
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; Arguments at entry:
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
;
; Notes:
;  - All row loads/stores carry the @64 qualifier, so every row address is
;    asserted to be 64-bit aligned.
;  - *blimit is read before the count check, i.e. even when count == 0.
;  - The single pitch is recovered from the doubled pitch with "lsr #1",
;    which is exact only for a non-negative pitch.
|vpx_lpf_horizontal_4_neon| PROC
    push        {lr}                       ; lr is overwritten by the bl below

    vld1.8      {d0[]}, [r2]               ; d0 = *blimit in every lane
    ldr         r2, [sp, #4]               ; r2 = thresh (1st stack arg, +4 for pushed lr)
    ldr         r12, [sp, #8]              ; r12 = count (2nd stack arg, +4 for pushed lr)
    mov         r1, r1, lsl #1             ; r1 = 2 * pitch: r2/r3 each step two rows

    cmp         r12, #0
    beq         lf_h4_done                 ; count == 0: nothing to filter

    vld1.8      {d2[]}, [r2]               ; d2 = *thresh in every lane
    vld1.8      {d1[]}, [r3]               ; d1 = *limit in every lane

lf_h4_next_8
    ; r2 walks rows p3, p1, q0, q2; r3 walks rows p2, p0, q1, q3.
    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch
    add         r3, r2, r1, lsr #1         ; r3 = r2 + pitch

    vld1.u8     {d3}, [r2@64], r1          ; p3
    vld1.u8     {d4}, [r3@64], r1          ; p2
    vld1.u8     {d5}, [r2@64], r1          ; p1
    vld1.u8     {d6}, [r3@64], r1          ; p0
    vld1.u8     {d7}, [r2@64], r1          ; q0
    vld1.u8     {d16}, [r3@64], r1         ; q1
    vld1.u8     {d17}, [r2@64]             ; q2 (no write-back)
    vld1.u8     {d18}, [r3@64]             ; q3 (no write-back)

    ; Point the two cursors at the first rows to be written back.
    sub         r2, r0, r1                 ; r2 = s - 2 * pitch (row p1)
    add         r3, r2, r1, lsr #1         ; r3 = r2 + pitch    (row p0)

    bl          vpx_loop_filter_neon       ; d4..d7 = op1, op0, oq0, oq1

    vst1.u8     {d4}, [r2@64], r1          ; row p1 <- op1
    vst1.u8     {d5}, [r3@64], r1          ; row p0 <- op0
    vst1.u8     {d6}, [r2@64], r1          ; row q0 <- oq0
    vst1.u8     {d7}, [r3@64], r1          ; row q1 <- oq1

    add         r0, r0, #8                 ; next 8 pixels along the edge
    subs        r12, r12, #1
    bne         lf_h4_next_8

lf_h4_done
    pop         {pc}                       ; return via the saved lr
    ENDP        ; |vpx_lpf_horizontal_4_neon|
79
; Currently vpx only works on iterations 8 at a time. The vp8 loop filter
; works on 16 iterations at a time.
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
; void vpx_lpf_vertical_4_neon(uint8_t *s,
;                              int p /* pitch */,
;                              const uint8_t *blimit,
;                              const uint8_t *limit,
;                              const uint8_t *thresh,
;                              int count)
;
; Filters across a vertical edge. For each pass, 8 rows of 8 bytes starting
; at s - 4 are loaded and transposed so that d3..d18 hold the columns
; p3..q3, vpx_loop_filter_neon is called, and the four filtered columns
; (p1, p0, q0, q1) are written back at s - 2 of each row. `count` passes are
; made, moving s down by 8 rows each time.
;
; Arguments at entry (after "push {lr}" the stack arguments are at
; sp+4 and sp+8):
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
; sp+4  int count
;
; Register roles inside the loop:
; r0    s for the current pass (unchanged until the advance at the bottom)
; r2    load cursor
; r3    store cursor (free once *thresh has been loaded)
; r12   passes remaining
|vpx_lpf_vertical_4_neon| PROC
    push        {lr}                       ; lr is overwritten by the bl below

    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
    ldr         r12, [sp, #8]              ; load count
    vld1.8      {d1[]}, [r3]               ; duplicate *limit

    ldr         r3, [sp, #4]               ; load thresh
    sub         r2, r0, #4                 ; r2 = s - 4: 4 bytes left of the edge
    cmp         r12, #0
    beq         end_vpx_lf_v_edge

    vld1.8      {d2[]}, [r3]               ; duplicate *thresh

count_lf_v_loop
    vld1.u8     {d3}, [r2], r1             ; load 8 rows of 8 bytes
    vld1.u8     {d4}, [r2], r1
    vld1.u8     {d5}, [r2], r1
    vld1.u8     {d6}, [r2], r1
    vld1.u8     {d7}, [r2], r1
    vld1.u8     {d16}, [r2], r1
    vld1.u8     {d17}, [r2], r1
    vld1.u8     {d18}, [r2]

    ; transpose the 8x8 byte block: rows -> columns p3..q3 in d3..d18
    vtrn.32     d3, d7
    vtrn.32     d4, d16
    vtrn.32     d5, d17
    vtrn.32     d6, d18

    vtrn.16     d3, d5
    vtrn.16     d4, d6
    vtrn.16     d7, d17
    vtrn.16     d16, d18

    vtrn.8      d3, d4
    vtrn.8      d5, d6
    vtrn.8      d7, d16
    vtrn.8      d17, d18

    bl          vpx_loop_filter_neon       ; d4..d7 = op1, op0, oq0, oq1

    ; Store through r3 rather than r0: the post-indexed stores move the
    ; cursor by 7 * pitch, and r0 must still equal s for the advance below.
    sub         r3, r0, #2                 ; r3 = s - 2: column p1 of first row

    ; store op1, op0, oq0, oq1: one 4-byte group per row (lane i = row i)
    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r3], r1
    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r3], r1
    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r3], r1
    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r3], r1
    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r3], r1
    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r3], r1
    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r3], r1
    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r3]

    add         r0, r0, r1, lsl #3         ; s += pitch * 8
    subs        r12, r12, #1
    subne       r2, r0, #4                 ; r2 = s - 4 for the next pass
    bne         count_lf_v_loop

end_vpx_lf_v_edge
    pop         {pc}                       ; return via the saved lr
    ENDP        ; |vpx_lpf_vertical_4_neon|
160
161; void vpx_loop_filter_neon();
162; This is a helper function for the loopfilters. The individual functions do the
163; necessary load, transpose (if necessary) and store. The function does not use
164; registers d8-d15.
165;
; Each d register holds 8 pixels (one byte per lane); all lanes are filtered
; independently. The routine is a leaf: it writes no core register, does not
; touch the stack and returns with "bx lr".
;
166; Inputs:
167; r0-r3, r12 PRESERVE (no core register is written by this routine)
168; d0    blimit
169; d1    limit
170; d2    thresh
171; d3    p3
172; d4    p2
173; d5    p1
174; d6    p0
175; d7    q0
176; d16   q1
177; d17   q2
178; d18   q3
179;
180; Outputs:
181; d4    op1
182; d5    op0
183; d6    oq0
184; d7    oq1
;
; Clobbers: d3 and d16-d28 (q12 = d24/d25) in addition to the outputs.
; d0-d2 are only read.
185|vpx_loop_filter_neon| PROC
186    ; filter_mask
187    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
188    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
189    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
190    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
191    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1); p3 (d3) is dead, reuse it
192    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2); p2 (d4) is dead, reuse it
193
194    ; only compare the largest value to limit
195    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
196    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
197
198    vabd.u8     d17, d6, d7                 ; abs(p0 - q0); q2 (d17) is dead, reuse it
199
200    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
201
202    vmov.u8     d18, #0x80                  ; sign-flip constant; q3 (d18) is dead, reuse it
203
204    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
205
206    ; hevmask
207    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
208    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
209    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3) = largest of m1..m6
210
211    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
212    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2 (saturating)
213
214    veor        d7, d7, d18                 ; qs0 = q0 ^ 0x80 (unsigned -> signed)
215
216    vcge.u8     d23, d1, d23                ; (limit >= m1)*-1: all-ones where every m is within limit
217
218    ; filter() function
219    ; convert to signed
220
221    vshr.u8     d28, d28, #1                ; a = a / 2
222    veor        d6, d6, d18                 ; ps0 = p0 ^ 0x80
223
224    veor        d5, d5, d18                 ; ps1 = p1 ^ 0x80
225    vqadd.u8    d17, d17, d28               ; a = b + a (saturating)
226
227    veor        d16, d16, d18               ; qs1 = q1 ^ 0x80
228
229    vmov.u8     d19, #3
230
231    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0), modular 8-bit subtract. NOTE(review): wraps if the true difference exceeds int8; presumably such lanes are masked out below - confirm
232
233    vcge.u8     d17, d0, d17                ; (blimit >= a)*-1: all-ones where a is within blimit
234
235    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
236    vorr        d22, d21, d22               ; hevmask = either side above thresh
237
238    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0), widened to 16 bits
239
240    vand        d27, d27, d22               ; filter &= hev
241    vand        d23, d23, d17               ; filter_mask = within limit AND within blimit
242
243    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
244
245    vmov.u8     d17, #4
246
247    ; filter = clamp(filter + 3 * ( qs0 - ps0))
248    vqmovn.s16  d27, q12
249
250    vand        d27, d27, d23               ; filter &= mask
251
252    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
253    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
254    vshr.s8     d28, d28, #3                ; filter2 >>= 3
255    vshr.s8     d27, d27, #3                ; filter1 >>= 3
256
257    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
258    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
259
260    ; outer tap adjustments
261    vrshr.s8    d27, d27, #1                ; filter = (filter1 + 1) >> 1 (rounding shift)
262
263    veor        d6, d26, d18                ; *oq0 = u^0x80
264
265    vbic        d27, d27, d22               ; filter &= ~hev
266
267    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
268    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
269
270    veor        d5, d19, d18                ; *op0 = u^0x80
271    veor        d4, d21, d18                ; *op1 = u^0x80
272    veor        d7, d20, d18                ; *oq1 = u^0x80
273
274    bx          lr
275    ENDP        ; |vpx_loop_filter_neon|
276
277    END
278