1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
13    ARM
14    REQUIRE8
15    PRESERVE8
16
17    AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; vp8_loop_filter_simple_horizontal_edge_neon
;
; Simple loop filter across one horizontal edge, 16 pixels wide.
; Loads the four rows p1, p0, q0, q1 (s - 2*pitch .. s + pitch) and
; writes back the filtered p0 (at s - pitch) and q0 (at s).
;
; Note: flimit and limit should be positive numbers. All 16 elements in
; flimit are equal, so only one byte is loaded and replicated across the
; vector. The same applies to limit.
;
; Arguments:
;   r0      unsigned char *s            ; first pixel of row q0
;   r1      int p                       ; pitch, in bytes
;   r2      const signed char *flimit
;   r3      const signed char *limit
;   stack   const signed char *thresh   ; not read by this routine
;   stack   int count                   ; not read by this routine
;
; Clobbers: r0, r3, r12, q0-q3, q8-q15.
; q4-q7 (d8-d15) are used as working registers but are saved on entry
; and restored on exit, because AAPCS makes d8-d15 callee-saved.
;-----------------------------------------------------------------------

|vp8_loop_filter_simple_horizontal_edge_neon| PROC
    vpush       {d8-d15}                    ; preserve callee-saved q4-q7 (64 bytes, keeps 8-byte alignment)

    sub         r0, r0, r1, lsl #1          ; r0 = s - 2 * pitch (row p1)

    adr         r12, lfhy_coeff             ; r12 -> constant vectors 0x80, 0x03, 0x04
    vld1.u8     {q5}, [r0], r1              ; p1
    vld1.s8     {d2[], d3[]}, [r2]          ; flimit byte replicated -> q1
    vld1.s8     {d26[], d27[]}, [r3]        ; limit byte replicated -> q13
    vld1.u8     {q6}, [r0], r1              ; p0
    vld1.u8     {q0}, [r12]!                ; 0x80
    vld1.u8     {q7}, [r0], r1              ; q0
    vld1.u8     {q10}, [r12]!               ; 0x03
    vld1.u8     {q8}, [r0]                  ; q1 (r0 is left at s + pitch)

    ;vp8_filter_mask() function
    vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
    vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
    vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2 (saturating)
    vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
    vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    ;vp8_filter() function
    veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
    veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
    veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
    veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value

    ; The two adds below are non-saturating: flimit * 2 + limit is
    ; expected to fit in an unsigned byte.
    vadd.u8     q1, q1, q1                  ; flimit * 2
    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
    vcge.u8     q15, q1, q15                ; mask = (flimit*2 + limit >= abs(p0-q0)*2 + abs(p1-q1)/2) ? 0xff : 0

    ; vp8_filter + 3 * (qs0 - ps0) is evaluated in 16-bit lanes so that
    ; only the final narrowing step saturates to signed 8-bit.
    vsubl.s8    q2, d14, d12                ; (qs0 - ps0), low 8 pixels, widened
    vsubl.s8    q3, d15, d13                ; (qs0 - ps0), high 8 pixels, widened

    vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)

    vadd.s16    q11, q2, q2                 ; 2 * (qs0 - ps0)
    vadd.s16    q12, q3, q3

    vld1.u8     {q9}, [r12]!                ; 0x04

    vadd.s16    q2, q2, q11                 ; 3 * (qs0 - ps0)
    vadd.s16    q3, q3, q12

    vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * (qs0 - ps0)
    vaddw.s8    q3, q3, d9

    vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0))
    vqmovn.s16  d9, q3

    vand        q4, q4, q15                 ; vp8_filter &= mask

    ; Filter2 must be formed before q4 is overwritten with Filter1.
    vqadd.s8    q2, q4, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
    vqadd.s8    q4, q4, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
    vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
    vshr.s8     q4, q4, #3                  ; Filter1 >>= 3

    sub         r0, r0, r1, lsl #1          ; r0 = s - pitch (row p0)

    ;calculate output
    vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
    vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)

    add         r3, r0, r1                  ; r3 = s (row q0)

    veor        q6, q11, q0                 ; *op0 = u^0x80
    veor        q7, q10, q0                 ; *oq0 = u^0x80

    vst1.u8     {q6}, [r0]                  ; store op0
    vst1.u8     {q7}, [r3]                  ; store oq0

    vpop        {d8-d15}                    ; restore callee-saved q4-q7
    bx          lr
    ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
105
106;-----------------
107
; Three 16-byte constant vectors, read back to back through a
; post-incremented pointer by vp8_loop_filter_simple_horizontal_edge_neon.
lfhy_coeff
    DCD     0x80808080, 0x80808080          ; 16 x 0x80: sign-bias mask
    DCD     0x80808080, 0x80808080
    DCD     0x03030303, 0x03030303          ; 16 x 0x03: Filter2 rounding term
    DCD     0x03030303, 0x03030303
    DCD     0x04040404, 0x04040404          ; 16 x 0x04: Filter1 rounding term
    DCD     0x04040404, 0x04040404
112
113    END
114