;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; Public entry points implemented in this file.
    EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
    EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
    EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
    EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
    ARM                                     ; assemble as ARM (A32) instructions
    REQUIRE8                                ; build attribute: requires 8-byte stack alignment
    PRESERVE8                               ; build attribute: preserves 8-byte stack alignment

    AREA ||.text||, CODE, READONLY, ALIGN=2

; flimit, limit, and thresh should be positive numbers.
; All 16 elements of each of these arrays are expected to be equal: only the
; first byte of each is loaded, and it is replicated across all 16 lanes.

; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
;                                               const signed char *flimit,
;                                               const signed char *limit,
;                                               const signed char *thresh,
;                                               int count)
;
; Macroblock loop filter across a horizontal edge, 16 pixels wide.
; Reads the eight rows src - 4 * pitch .. src + 3 * pitch (p3..q3) and
; rewrites the six rows p2..q2 in place.
;
; In:
;   r0        unsigned char *src
;   r1        int pitch
;   r2        const signed char *flimit
;   r3        const signed char *limit
;   [sp]      const signed char *thresh
;   [sp, #4]  int count (unused)
; Clobbers:   r0, r2, r3, r12, q0-q3, q8-q15
; Preserves:  r4, d8-d15 (saved and restored here; d8-d15 are callee-saved
;             under the AAPCS and are used as q4-q7 below)
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
    stmdb       sp!, {r4, lr}               ; r4 is pushed only to keep sp 8-byte aligned
    vpush       {d8-d15}                    ; save callee-saved q4-q7 (64 bytes)
    sub         r0, r0, r1, lsl #2          ; r0 = src - 4 * pitch (row p3)
    ldr         r12, [sp, #72]              ; thresh pointer: 8 (r4, lr) + 64 (d8-d15)

    vld1.u8     {q3}, [r0], r1              ; p3
    vld1.s8     {d2[], d3[]}, [r3]          ; limit, broadcast to all lanes of q1
    vld1.u8     {q4}, [r0], r1              ; p2
    vld1.s8     {d4[], d5[]}, [r12]         ; thresh, broadcast to all lanes of q2
    vld1.u8     {q5}, [r0], r1              ; p1
    vld1.u8     {q6}, [r0], r1              ; p0
    vld1.u8     {q7}, [r0], r1              ; q0
    vld1.u8     {q8}, [r0], r1              ; q1
    vld1.u8     {q9}, [r0], r1              ; q2
    vld1.u8     {q10}, [r0], r1             ; q3

    bl          vp8_mbloop_filter_neon      ; results come back in q4-q9

    sub         r0, r0, r1, lsl #3          ; rewind the 8 rows just read (row p3)
    add         r0, r0, r1                  ; r0  = row p2
    add         r2, r0, r1                  ; r2  = row p1
    add         r3, r2, r1                  ; r3  = row p0

    vst1.u8     {q4}, [r0]                  ; store op2
    vst1.u8     {q5}, [r2]                  ; store op1
    vst1.u8     {q6}, [r3], r1              ; store op0, r3 = row q0
    add         r12, r3, r1                 ; r12 = row q1
    vst1.u8     {q7}, [r3]                  ; store oq0
    vst1.u8     {q8}, [r12], r1             ; store oq1, r12 = row q2
    vst1.u8     {q9}, [r12]                 ; store oq2

    vpop        {d8-d15}
    ldmia       sp!, {r4, pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|

; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
;                                                const signed char *flimit,
;                                                const signed char *limit,
;                                                const signed char *thresh,
;                                                unsigned char *v)
;
; Macroblock loop filter across a horizontal edge of two 8-pixel-wide planes
; at once. Each q register holds a row of u in its low half and the matching
; row of v in its high half. Rows p3..q3 are read from both planes and rows
; p2..q2 are rewritten in place.
;
; In:
;   r0        unsigned char *u
;   r1        int pitch
;   r2        const signed char *flimit
;   r3        const signed char *limit
;   [sp]      const signed char *thresh
;   [sp, #4]  unsigned char *v
; Clobbers:   r0, r3, r12, q0-q3, q8-q15
; Preserves:  r4, d8-d15 (saved and restored here; d8-d15 are callee-saved
;             under the AAPCS and are used as q4-q7 below)
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
    stmdb       sp!, {r4, lr}               ; r4 is pushed only to keep sp 8-byte aligned
    vpush       {d8-d15}                    ; save callee-saved q4-q7 (64 bytes)
    sub         r0, r0, r1, lsl #2          ; r0 = u - 4 * pitch (row p3)
    vld1.s8     {d2[], d3[]}, [r3]          ; limit, broadcast (r3 is reused below)
    ldr         r3, [sp, #76]               ; v pointer:      8 + 64 + 4
    ldr         r12, [sp, #72]              ; thresh pointer: 8 + 64
    sub         r3, r3, r1, lsl #2          ; r3 = v - 4 * pitch (row p3)

    vld1.u8     {d6}, [r0], r1              ; u p3
    vld1.u8     {d7}, [r3], r1              ; v p3
    vld1.u8     {d8}, [r0], r1              ; u p2
    vld1.u8     {d9}, [r3], r1              ; v p2
    vld1.u8     {d10}, [r0], r1             ; u p1
    vld1.u8     {d11}, [r3], r1             ; v p1
    vld1.u8     {d12}, [r0], r1             ; u p0
    vld1.u8     {d13}, [r3], r1             ; v p0
    vld1.u8     {d14}, [r0], r1             ; u q0
    vld1.u8     {d15}, [r3], r1             ; v q0
    vld1.u8     {d16}, [r0], r1             ; u q1
    vld1.u8     {d17}, [r3], r1             ; v q1
    vld1.u8     {d18}, [r0], r1             ; u q2
    vld1.u8     {d19}, [r3], r1             ; v q2
    vld1.u8     {d20}, [r0], r1             ; u q3
    vld1.u8     {d21}, [r3], r1             ; v q3

    vld1.s8     {d4[], d5[]}, [r12]         ; thresh, broadcast to all lanes of q2

    bl          vp8_mbloop_filter_neon      ; results come back in q4-q9

    sub         r0, r0, r1, lsl #3          ; rewind both pointers the 8 rows read
    sub         r3, r3, r1, lsl #3

    add         r0, r0, r1                  ; skip p3: start storing at row p2
    add         r3, r3, r1

    vst1.u8     {d8}, [r0], r1              ; store u op2
    vst1.u8     {d9}, [r3], r1              ; store v op2
    vst1.u8     {d10}, [r0], r1             ; store u op1
    vst1.u8     {d11}, [r3], r1             ; store v op1
    vst1.u8     {d12}, [r0], r1             ; store u op0
    vst1.u8     {d13}, [r3], r1             ; store v op0
    vst1.u8     {d14}, [r0], r1             ; store u oq0
    vst1.u8     {d15}, [r3], r1             ; store v oq0
    vst1.u8     {d16}, [r0], r1             ; store u oq1
    vst1.u8     {d17}, [r3], r1             ; store v oq1
    vst1.u8     {d18}, [r0], r1             ; store u oq2
    vst1.u8     {d19}, [r3], r1             ; store v oq2

    vpop        {d8-d15}
    ldmia       sp!, {r4, pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|

; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
;                                             const signed char *flimit,
;                                             const signed char *limit,
;                                             const signed char *thresh,
;                                             int count)
;
; Macroblock loop filter across a vertical edge, 16 rows tall.
; Reads 8 bytes (columns src - 4 .. src + 3, i.e. p3..q3) from each of 16
; rows, transposes them so every q register holds one column, filters, then
; transposes back and rewrites all 8 bytes of every row. p3 and q3 are
; written back unchanged: the helper clobbers q3 and q10, so both are spilled
; to the stack around the call.
;
; In:
;   r0        unsigned char *src
;   r1        int pitch
;   r2        const signed char *flimit
;   r3        const signed char *limit
;   [sp]      const signed char *thresh
;   [sp, #4]  int count (unused)
; Clobbers:   r0, r2, r3, r12, q0-q3, q8-q15
; Preserves:  r4, d8-d15 (saved and restored here; d8-d15 are callee-saved
;             under the AAPCS and are used as q4-q7 below)
; Stack:      8 (r4, lr) + 64 (d8-d15) + 32 bytes of spill for p3 and q3
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
    stmdb       sp!, {r4, lr}               ; r4 is pushed only to keep sp 8-byte aligned
    vpush       {d8-d15}                    ; save callee-saved q4-q7 (64 bytes)
    sub         r0, r0, #4                  ; r0 = src - 4 (column p3)
    ldr         r12, [sp, #72]              ; thresh pointer: 8 (r4, lr) + 64 (d8-d15)

    vld1.u8     {d6}, [r0], r1              ; rows 0-7 into the low halves of q3-q10
    vld1.u8     {d8}, [r0], r1
    vld1.u8     {d10}, [r0], r1
    vld1.u8     {d12}, [r0], r1
    vld1.u8     {d14}, [r0], r1
    vld1.u8     {d16}, [r0], r1
    vld1.u8     {d18}, [r0], r1
    vld1.u8     {d20}, [r0], r1

    vld1.u8     {d7}, [r0], r1              ; rows 8-15 into the high halves of q3-q10
    vld1.u8     {d9}, [r0], r1
    vld1.u8     {d11}, [r0], r1
    vld1.u8     {d13}, [r0], r1
    vld1.u8     {d15}, [r0], r1
    vld1.u8     {d17}, [r0], r1
    vld1.u8     {d19}, [r0], r1
    vld1.u8     {d21}, [r0], r1

    ; transpose to 8x16 matrix: q3..q10 become columns p3..q3
    vtrn.32     q3, q7
    vtrn.32     q4, q8
    vtrn.32     q5, q9
    vtrn.32     q6, q10

    vtrn.16     q3, q5
    vtrn.16     q4, q6
    vtrn.16     q7, q9
    vtrn.16     q8, q10

    vtrn.8      q3, q4
    vtrn.8      q5, q6
    vtrn.8      q7, q8
    vtrn.8      q9, q10

    vld1.s8     {d4[], d5[]}, [r12]         ; thresh, broadcast to all lanes of q2
    vld1.s8     {d2[], d3[]}, [r3]          ; limit, broadcast to all lanes of q1

    sub         sp, sp, #32                 ; spill area for p3 and q3
    mov         r12, sp
    vst1.u8     {q3}, [r12]!                ; save p3 (helper clobbers q3)
    vst1.u8     {q10}, [r12]!               ; save q3 (helper clobbers q10)

    bl          vp8_mbloop_filter_neon      ; results come back in q4-q9

    sub         r0, r0, r1, lsl #4          ; rewind the 16 rows read: r0 = row 0
    add         r2, r0, r1                  ; r2 = row 1
    add         r3, r2, r1                  ; r3 = row 2

    vld1.u8     {q3}, [sp]!                 ; reload p3
    vld1.u8     {q10}, [sp]!                ; reload q3; sp is back above the spill area

    ; transpose to 16x8 matrix: q3..q10 become row pairs again
    vtrn.32     q3, q7
    vtrn.32     q4, q8
    vtrn.32     q5, q9
    vtrn.32     q6, q10
    add         r12, r3, r1                 ; r12 = row 3

    vtrn.16     q3, q5
    vtrn.16     q4, q6
    vtrn.16     q7, q9
    vtrn.16     q8, q10

    vtrn.8      q3, q4
    vtrn.8      q5, q6
    vtrn.8      q7, q8
    vtrn.8      q9, q10

    ; store all 8 bytes (p3..q3) of each of the 16 rows; r0, r2, r3 and r12
    ; take turns as the row pointer
    vst1.8      {d6}, [r0]                  ; row 0
    vst1.8      {d8}, [r2]                  ; row 1
    vst1.8      {d10}, [r3]                 ; row 2
    vst1.8      {d12}, [r12], r1            ; row 3
    add         r0, r12, r1
    vst1.8      {d14}, [r12]                ; row 4
    vst1.8      {d16}, [r0], r1             ; row 5
    add         r2, r0, r1
    vst1.8      {d18}, [r0]                 ; row 6
    vst1.8      {d20}, [r2], r1             ; row 7
    add         r3, r2, r1
    vst1.8      {d7}, [r2]                  ; row 8
    vst1.8      {d9}, [r3], r1              ; row 9
    add         r12, r3, r1
    vst1.8      {d11}, [r3]                 ; row 10
    vst1.8      {d13}, [r12], r1            ; row 11
    add         r0, r12, r1
    vst1.8      {d15}, [r12]                ; row 12
    vst1.8      {d17}, [r0], r1             ; row 13
    add         r2, r0, r1
    vst1.8      {d19}, [r0]                 ; row 14
    vst1.8      {d21}, [r2]                 ; row 15

    vpop        {d8-d15}
    ldmia       sp!, {r4, pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|

; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
;                                              const signed char *flimit,
;                                              const signed char *limit,
;                                              const signed char *thresh,
;                                              unsigned char *v)
;
; Macroblock loop filter across a vertical edge of two 8-row-tall planes at
; once. Reads 8 bytes (columns ptr - 4 .. ptr + 3, i.e. p3..q3) from each of
; 8 rows of u and 8 rows of v, transposes them so every q register holds one
; column (u in the low half, v in the high half), filters, then transposes
; back and rewrites all 8 bytes of every row. p3 and q3 are written back
; unchanged: the helper clobbers q3 and q10, so both are spilled to the stack
; around the call.
;
; In:
;   r0        unsigned char *u
;   r1        int pitch
;   r2        const signed char *flimit
;   r3        const signed char *limit
;   [sp]      const signed char *thresh
;   [sp, #4]  unsigned char *v
; Clobbers:   r0, r3, r12, q0-q3, q8-q15
; Preserves:  r4, d8-d15 (saved and restored here; d8-d15 are callee-saved
;             under the AAPCS and are used as q4-q7 below)
; Stack:      8 (r4, lr) + 64 (d8-d15) + 32 bytes of spill for p3 and q3
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
    stmdb       sp!, {r4, lr}               ; r4 is pushed only to keep sp 8-byte aligned
    vpush       {d8-d15}                    ; save callee-saved q4-q7 (64 bytes)
    sub         r0, r0, #4                  ; r0 = u - 4 (column p3)
    vld1.s8     {d2[], d3[]}, [r3]          ; limit, broadcast (r3 is reused below)
    ldr         r3, [sp, #76]               ; v pointer:      8 + 64 + 4
    ldr         r12, [sp, #72]              ; thresh pointer: 8 + 64

    sub         r3, r3, #4                  ; r3 = v - 4 (column p3)

    vld1.u8     {d6}, [r0], r1              ; u rows into the low halves of q3-q10
    vld1.u8     {d7}, [r3], r1              ; v rows into the high halves of q3-q10
    vld1.u8     {d8}, [r0], r1
    vld1.u8     {d9}, [r3], r1
    vld1.u8     {d10}, [r0], r1
    vld1.u8     {d11}, [r3], r1
    vld1.u8     {d12}, [r0], r1
    vld1.u8     {d13}, [r3], r1
    vld1.u8     {d14}, [r0], r1
    vld1.u8     {d15}, [r3], r1
    vld1.u8     {d16}, [r0], r1
    vld1.u8     {d17}, [r3], r1
    vld1.u8     {d18}, [r0], r1
    vld1.u8     {d19}, [r3], r1
    vld1.u8     {d20}, [r0], r1
    vld1.u8     {d21}, [r3], r1

    ; transpose to 8x16 matrix: q3..q10 become columns p3..q3
    vtrn.32     q3, q7
    vtrn.32     q4, q8
    vtrn.32     q5, q9
    vtrn.32     q6, q10

    vtrn.16     q3, q5
    vtrn.16     q4, q6
    vtrn.16     q7, q9
    vtrn.16     q8, q10

    vtrn.8      q3, q4
    vtrn.8      q5, q6
    vtrn.8      q7, q8
    vtrn.8      q9, q10

    sub         sp, sp, #32                 ; spill area for p3 and q3
    vld1.s8     {d4[], d5[]}, [r12]         ; thresh, broadcast to all lanes of q2
    mov         r12, sp
    vst1.u8     {q3}, [r12]!                ; save p3 (helper clobbers q3)
    vst1.u8     {q10}, [r12]!               ; save q3 (helper clobbers q10)

    bl          vp8_mbloop_filter_neon      ; results come back in q4-q9

    sub         r0, r0, r1, lsl #3          ; rewind both pointers the 8 rows read
    sub         r3, r3, r1, lsl #3

    vld1.u8     {q3}, [sp]!                 ; reload p3
    vld1.u8     {q10}, [sp]!                ; reload q3; sp is back above the spill area

    ; transpose to 16x8 matrix: q3..q10 become u/v row pairs again
    vtrn.32     q3, q7
    vtrn.32     q4, q8
    vtrn.32     q5, q9
    vtrn.32     q6, q10

    vtrn.16     q3, q5
    vtrn.16     q4, q6
    vtrn.16     q7, q9
    vtrn.16     q8, q10

    vtrn.8      q3, q4
    vtrn.8      q5, q6
    vtrn.8      q7, q8
    vtrn.8      q9, q10

    ; store all 8 bytes (p3..q3) of each of the 8 rows of u and of v
    vst1.8      {d6}, [r0], r1              ; u row 0
    vst1.8      {d7}, [r3], r1              ; v row 0
    vst1.8      {d8}, [r0], r1              ; u row 1
    vst1.8      {d9}, [r3], r1              ; v row 1
    vst1.8      {d10}, [r0], r1             ; u row 2
    vst1.8      {d11}, [r3], r1             ; v row 2
    vst1.8      {d12}, [r0], r1             ; u row 3
    vst1.8      {d13}, [r3], r1             ; v row 3
    vst1.8      {d14}, [r0], r1             ; u row 4
    vst1.8      {d15}, [r3], r1             ; v row 4
    vst1.8      {d16}, [r0], r1             ; u row 5
    vst1.8      {d17}, [r3], r1             ; v row 5
    vst1.8      {d18}, [r0], r1             ; u row 6
    vst1.8      {d19}, [r3], r1             ; v row 6
    vst1.8      {d20}, [r0], r1             ; u row 7
    vst1.8      {d21}, [r3], r1             ; v row 7

    vpop        {d8-d15}
    ldmia       sp!, {r4, pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|

; void vp8_mbloop_filter_neon()
; This is a helper function for the macroblock loopfilters. The individual
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.

; TODO:
; The vertical filter writes p3/q3 back out because two 4 element writes are
; much simpler than ordering and writing two 3 element sets (or three 2 elements
; sets, or whichever other combinations are possible).
; If we can preserve q3 and q10, the vertical filter will be able to avoid
; storing those values on the stack and reading them back after the filter.

; Not a standard-ABI function: operands are passed in and returned through
; NEON registers, and it is only reached by bl from the entry points in this
; file.
;
; In:
;   r0,r1 PRESERVE
;   r2    flimit (pointer; first byte is broadcast)
;   r3    PRESERVE
;   q1    limit
;   q2    thresh
;   q3    p3
;   q4    p2
;   q5    p1
;   q6    p0
;   q7    q0
;   q8    q1
;   q9    q2
;   q10   q3
; Out:
;   q4    op2
;   q5    op1
;   q6    op0
;   q7    oq0
;   q8    oq1
;   q9    oq2
; Clobbers: r12, q0-q3, q10-q15

|vp8_mbloop_filter_neon| PROC
    adr         r12, mblf_coeff             ; r12 walks the constant table

    ; vp8_filter_mask
    vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
    vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
    vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
    vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
    vabd.u8     q0, q10, q9                 ; abs(q3 - q2)

    vmax.u8     q11, q11, q12               ; fold the six differences to their max
    vmax.u8     q12, q13, q14
    vmax.u8     q3, q3, q0
    vmax.u8     q15, q11, q12

    vabd.u8     q12, q6, q7                 ; abs(p0 - q0)

    ; vp8_hevmask
    vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
    vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
    vmax.u8     q15, q15, q3                ; q15 = max of all six differences

    vld1.s8     {d4[], d5[]}, [r2]          ; flimit (thresh in q2 is no longer needed)

    vld1.u8     {q0}, [r12]!                ; #0x80

    vadd.u8     q2, q2, q2                  ; flimit * 2
    vadd.u8     q2, q2, q1                  ; flimit * 2 +  limit
    vcge.u8     q15, q1, q15                ; (limit >= max difference) * -1

    vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
    vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
    vshr.u8     q1, q1, #1                  ; a = a / 2
    vqadd.u8    q12, q12, q1                ; a = b + a
    vcge.u8     q12, q2, q12                ; (flimit * 2 + limit >= a) * -1

    ; vp8_filter
    ; convert to signed
    veor        q7, q7, q0                  ; qs0
    veor        q6, q6, q0                  ; ps0
    veor        q5, q5, q0                  ; ps1
    veor        q8, q8, q0                  ; qs1
    veor        q4, q4, q0                  ; ps2
    veor        q9, q9, q0                  ; qs2

    vorr        q14, q13, q14               ; vp8_hevmask

    vsubl.s8    q2, d14, d12                ; qs0 - ps0 (widened to 16 bits)
    vsubl.s8    q13, d15, d13

    vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)

    vadd.s16    q10, q2, q2                 ; 2 * (qs0 - ps0)
    vadd.s16    q11, q13, q13
    vand        q15, q15, q12               ; vp8_filter_mask

    vadd.s16    q2, q2, q10                 ; 3 * (qs0 - ps0)
    vadd.s16    q13, q13, q11

    vld1.u8     {q12}, [r12]!               ; #3

    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
    vaddw.s8    q13, q13, d3

    vld1.u8     {q11}, [r12]!               ; #4

    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16  d2, q2
    vqmovn.s16  d3, q13

    vand        q1, q1, q15                 ; vp8_filter &= mask

    vld1.u8     {q15}, [r12]!               ; #63 (16-bit lanes)

    vand        q13, q1, q14                ; Filter2 = vp8_filter & hev

    vld1.u8     {d7}, [r12]!                ; #9

    vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
    vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)

    vld1.u8     {d6}, [r12]!                ; #18

    vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
    vshr.s8     q13, q13, #3                ; Filter2 >>= 3

    vmov        q10, q15                    ; seed accumulators with the rounding term 63
    vmov        q12, q15

    vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)

    vld1.u8     {d5}, [r12]!                ; #27

    vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)

    vbic        q1, q1, q14                 ; vp8_filter &= ~hev

    ; roughly 1/7th difference across boundary
    ; roughly 2/7th difference across boundary
    ; roughly 3/7th difference across boundary
    vmov        q11, q15
    vmov        q13, q15
    vmov        q14, q15

    vmlal.s8    q10, d2, d7                 ; 63 + Filter2 * 9
    vmlal.s8    q11, d3, d7
    vmlal.s8    q12, d2, d6                 ; 63 + Filter2 * 18
    vmlal.s8    q13, d3, d6
    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
    vmlal.s8    q15, d3, d5
    vqshrn.s16  d20, q10, #7                ; u = clamp((63 + Filter2 * 9)>>7)
    vqshrn.s16  d21, q11, #7
    vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
    vqshrn.s16  d25, q13, #7
    vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
    vqshrn.s16  d29, q15, #7

    vqsub.s8    q11, q9, q10                ; s = clamp(qs2 - u)
    vqadd.s8    q10, q4, q10                ; s = clamp(ps2 + u)
    vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
    vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
    vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
    vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
    veor        q9, q11, q0                 ; *oq2 = s^0x80
    veor        q4, q10, q0                 ; *op2 = s^0x80
    veor        q8, q13, q0                 ; *oq1 = s^0x80
    veor        q5, q12, q0                 ; *op1 = s^0x80
    veor        q7, q15, q0                 ; *oq0 = s^0x80
    veor        q6, q14, q0                 ; *op0 = s^0x80

    bx          lr
    ENDP        ; |vp8_mbloop_filter_neon|

;-----------------
; Constant table for vp8_mbloop_filter_neon. The helper takes its address with
; adr and reads it front to back with post-incremented loads, so the order and
; size of the rows below must match that load sequence.

mblf_coeff
    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080  ; 16 x 0x80 (bytes)
    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303  ; 16 x 3    (bytes)
    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404  ; 16 x 4    (bytes)
    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f  ; 8 x 63    (halfwords)
    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212  ; 8 x 9, then 8 x 18 (bytes)
    DCD     0x1b1b1b1b, 0x1b1b1b1b                          ; 8 x 27    (bytes)

    END