1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_lpf_horizontal_4_neon|
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_lpf_vertical_4_neon|
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_lpf_horizontal_8_neon|
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_lpf_vertical_8_neon|
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ARM
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan
17233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA ||.text||, CODE, READONLY, ALIGN=2
18233d2500723e5594f3e7c70896ffeeef32b9c950ywan
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Currently the vp9 loop filter processes 8 pixels per iteration, whereas the
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan; vp8 loop filter processes 16 pixels per iteration.
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan; TODO(fgalligan): See about removing the count code as this function is only
22233d2500723e5594f3e7c70896ffeeef32b9c950ywan; called with a count of 1.
23233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan; void vp9_lpf_horizontal_4_neon(uint8_t *s,
25233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                int p /* pitch */,
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                const uint8_t *blimit,
27233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                const uint8_t *limit,
28233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                const uint8_t *thresh,
29233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                int count)
30233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
31233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0    uint8_t *s,
32233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1    int p, /* pitch */
33233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2    const uint8_t *blimit,
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3    const uint8_t *limit,
35233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp    const uint8_t *thresh,
36233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp+4  int count
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_lpf_horizontal_4_neon| PROC
    ; Filters a horizontal edge. For each of `count` groups of 8 pixels this
    ; loads the eight rows s - 4 * pitch .. s + 3 * pitch (p3..q3), calls
    ; vp9_loop_filter_neon, and writes op1, op0, oq0, oq1 back to the four
    ; rows s - 2 * pitch .. s + pitch.
38233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        {lr}                       ; bl below overwrites lr
39233d2500723e5594f3e7c70896ffeeef32b9c950ywan
40233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d0[]}, [r2]               ; duplicate *blimit into every lane of d0
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #8]              ; load count (sp+4 at entry, +4 for pushed lr)
42233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r2, [sp, #4]               ; load thresh (sp at entry, +4 for pushed lr)
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r1, r1, r1                 ; r1 = 2 * pitch; r2 and r3 each step two rows
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan
45233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r12, #0
46233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         end_vp9_lf_h_edge          ; nothing to filter when count == 0
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan
48233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d1[]}, [r3]               ; duplicate *limit
49233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
50233d2500723e5594f3e7c70896ffeeef32b9c950ywan
51233d2500723e5594f3e7c70896ffeeef32b9c950ywancount_lf_h_loop
52233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r2, r0, r1, lsl #1         ; r2 = s - 4 * pitch (row p3)
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r3, r2, r1, lsr #1         ; r3 = r2 + pitch (row p2)
54233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; r2 walks rows p3, p1, q0, q2 and r3 walks rows p2, p0, q1, q3. Each load
    ; reads 8 bytes; @64 specifies a 64-bit aligned address.
55233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d3}, [r2@64], r1          ; p3
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d4}, [r3@64], r1          ; p2
57233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d5}, [r2@64], r1          ; p1
58233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d6}, [r3@64], r1          ; p0
59233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d7}, [r2@64], r1          ; q0
60233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d16}, [r3@64], r1         ; q1
61233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d17}, [r2@64]             ; q2 (no post-increment: r2 stays on row q2)
62233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d18}, [r3@64]             ; q3 (no post-increment: r3 stays on row q3)
63233d2500723e5594f3e7c70896ffeeef32b9c950ywan
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r2, r2, r1, lsl #1         ; r2 = s - 2 * pitch (row p1)
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r3, r3, r1, lsl #1         ; r3 = s - pitch (row p0)
66233d2500723e5594f3e7c70896ffeeef32b9c950ywan
67233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bl          vp9_loop_filter_neon       ; results come back in d4-d7
68233d2500723e5594f3e7c70896ffeeef32b9c950ywan
69233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d4}, [r2@64], r1          ; store op1 to row p1
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d5}, [r3@64], r1          ; store op0 to row p0
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d6}, [r2@64], r1          ; store oq0 to row q0
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d7}, [r3@64], r1          ; store oq1 to row q1
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r0, r0, #8                 ; s += 8: next group of 8 pixels on the edge
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        r12, r12, #1               ; --count, setting flags for the branch
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         count_lf_h_loop
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan
78233d2500723e5594f3e7c70896ffeeef32b9c950ywanend_vp9_lf_h_edge
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         {pc}                       ; return by popping the saved lr into pc
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_lpf_horizontal_4_neon|
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Currently the vp9 loop filter processes 8 pixels per iteration, whereas the
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan; vp8 loop filter processes 16 pixels per iteration.
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan; TODO(fgalligan): See about removing the count code as this function is only
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan; called with a count of 1.
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan; void vp9_lpf_vertical_4_neon(uint8_t *s,
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              int p /* pitch */,
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              const uint8_t *blimit,
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              const uint8_t *limit,
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              const uint8_t *thresh,
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              int count)
93233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0    uint8_t *s,
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1    int p, /* pitch */
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2    const uint8_t *blimit,
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3    const uint8_t *limit,
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp    const uint8_t *thresh,
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp+4  int count
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_lpf_vertical_4_neon| PROC
    ; Filters a vertical edge. For each of `count` groups of 8 rows this loads
    ; 8 bytes per row starting at column s - 4, transposes them so that each d
    ; register holds one column (p3..q3), calls vp9_loop_filter_neon, and
    ; writes the four columns op1, op0, oq0, oq1 back starting at column s - 2.
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        {lr}                      ; bl below overwrites lr
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #8]             ; load count (sp+4 at entry, +4 for pushed lr)
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d1[]}, [r3]              ; duplicate *limit
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r3, [sp, #4]              ; load thresh (sp at entry, +4 for pushed lr)
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r2, r0, #4                ; r2 = s - 4 (column p3)
109233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r12, #0
110233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         end_vp9_lf_v_edge         ; nothing to filter when count == 0
111233d2500723e5594f3e7c70896ffeeef32b9c950ywan
112233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
113233d2500723e5594f3e7c70896ffeeef32b9c950ywan
114233d2500723e5594f3e7c70896ffeeef32b9c950ywancount_lf_v_loop
115233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d3}, [r2], r1             ; load 8 bytes from each of 8 rows
116233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d4}, [r2], r1
117233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d5}, [r2], r1
118233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d6}, [r2], r1
119233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d7}, [r2], r1
120233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d16}, [r2], r1
121233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d17}, [r2], r1
122233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d18}, [r2]
123233d2500723e5594f3e7c70896ffeeef32b9c950ywan
124233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;transpose the 8x8 byte block: rows in, columns p3..q3 out in d3..d18
125233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d3, d7
126233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d4, d16
127233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d5, d17
128233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d6, d18
129233d2500723e5594f3e7c70896ffeeef32b9c950ywan
130233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d3, d5
131233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d4, d6
132233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d7, d17
133233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d16, d18
134233d2500723e5594f3e7c70896ffeeef32b9c950ywan
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d3, d4
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d5, d6
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d7, d16
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d17, d18
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bl          vp9_loop_filter_neon       ; results come back in d4-d7
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r0, r0, #2                 ; r0 = s - 2 (column of op1)
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;store op1, op0, oq0, oq1: lane i of d4-d7 forms the 4 bytes of row i
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0], r1
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; The eight post-incremented stores leave r0 at (s - 2) + pitch * 8, so
    ; undoing the 2-column offset lands exactly on the next group of 8 rows.
    ; (Adding pitch * 8 to the pointer left by the stores would overshoot the
    ; next group and keep the -2 column offset whenever count > 1.)
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r0, r0, #2                 ; r0 = s + pitch * 8
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        r12, r12, #1               ; --count, setting flags for subne/bne
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subne       r2, r0, #4                 ; r2 = s - 4 (column p3) for the next pass
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         count_lf_v_loop
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan
159233d2500723e5594f3e7c70896ffeeef32b9c950ywanend_vp9_lf_v_edge
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         {pc}                       ; return by popping the saved lr into pc
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_lpf_vertical_4_neon|
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan; void vp9_loop_filter_neon();
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan; This is a helper function for the loopfilters. The individual functions do the
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan; necessary load, transpose (if necessary) and store. The function does not use
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan; registers d8-d15.
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Inputs:
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0-r3, r12 PRESERVE
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d0    blimit
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d1    limit
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d2    thresh
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d3    p3
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d4    p2
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d5    p1
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d6    p0
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d7    q0
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d16   q1
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d17   q2
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d18   q3
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Outputs:
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d4    op1
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d5    op0
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d6    oq0
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d7    oq1
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_loop_filter_neon| PROC
    ; Builds the filter mask and the high-edge-variance (hev) mask from
    ; p3..q3, then applies the filter to p1, p0, q0, q1 for 8 pixels at once.
    ; Only NEON registers are written: the inputs d3-d7 and d16-d18 are
    ; overwritten, and d19-d28 are used as scratch. d0-d2 are read only.
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter_mask
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d19, d3, d4                 ; m1 = abs(p3 - p2)
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d20, d4, d5                 ; m2 = abs(p2 - p1)
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d21, d5, d6                 ; m3 = abs(p1 - p0)
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d22, d16, d7                ; m4 = abs(q1 - q0)
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d3, d17, d16                ; m5 = abs(q2 - q1), reusing d3 (p3)
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d4, d18, d17                ; m6 = abs(q3 - q2), reusing d4 (p2)
195233d2500723e5594f3e7c70896ffeeef32b9c950ywan
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only compare the largest value to limit
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d20               ; m1 = max(m1, m2)
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d20, d21, d22               ; m2 = max(m3, m4)
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d17, d6, d7                 ; abs(p0 - q0), reusing d17 (q2)
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d3, d3, d4                  ; m3 = max(m5, m6)
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d18, #0x80                  ; sign-bit constant for the XORs below
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d23, d19, d20               ; m1 = max(m1, m2)
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; hevmask
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcgt.u8     d21, d21, d2                ; (abs(p1 - p0) > thresh)*-1
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcgt.u8     d22, d22, d2                ; (abs(q1 - q0) > thresh)*-1
211233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d23, d23, d3                ; m1 = max(m1, m3)
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d28, d5, d16                ; a = abs(p1 - q1)
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.u8    d17, d17, d17               ; b = abs(p0 - q0) * 2 (saturating)
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d7, d7, d18                 ; qs0 = q0 ^ 0x80
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d23, d1, d23                ; mask = (limit >= m1)*-1
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter() function
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; convert to signed
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.u8     d28, d28, #1                ; a = a / 2
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d6, d6, d18                 ; ps0 = p0 ^ 0x80
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d5, d5, d18                 ; ps1 = p1 ^ 0x80
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.u8    d17, d17, d28               ; a = b + a (saturating), kept in d17
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d16, d16, d18               ; qs1 = q1 ^ 0x80
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d19, #3                     ; multiplier here, rounding term for filter2 later
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s8     d28, d7, d6                 ; ( qs0 - ps0)
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d17, d0, d17                ; (blimit >= a)*-1
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d27, d5, d16                ; filter = clamp(ps1-qs1)
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vorr        d22, d21, d22               ; hevmask
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s8    q12, d28, d19               ; 3 * ( qs0 - ps0), widened to 16 bits
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d27, d27, d22               ; filter &= hev
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d23, d23, d17               ; filter_mask = limit mask & blimit mask
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.s8    q12, q12, d27               ; filter + 3 * (qs0 - ps0)
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d17, #4                     ; rounding term for filter1
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter = clamp(filter + 3 * ( qs0 - ps0))
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovn.s16  d27, q12                    ; saturating narrow back to 8 bits
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d27, d27, d23               ; filter &= mask
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d28, d27, d19               ; filter2 = clamp(filter+3)
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d27, d27, d17               ; filter1 = clamp(filter+4)
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.s8     d28, d28, #3                ; filter2 >>= 3
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.s8     d27, d27, #3                ; filter1 >>= 3
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d19, d6, d28                ; u = clamp(ps0 + filter2)
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d26, d7, d27                ; u = clamp(qs0 - filter1)
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; outer tap adjustments
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s8    d27, d27, #1                ; filter = (filter1 + 1) >> 1 (rounding shift)
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d6, d26, d18                ; *oq0 = u^0x80
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbic        d27, d27, d22               ; filter &= ~hev
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d21, d5, d27                ; u = clamp(ps1 + filter)
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d20, d16, d27               ; u = clamp(qs1 - filter)
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d5, d19, d18                ; *op0 = u^0x80
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d4, d21, d18                ; *op1 = u^0x80
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d7, d20, d18                ; *oq1 = u^0x80
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bx          lr                          ; return to the caller's load/store wrapper
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_loop_filter_neon|
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                const uint8_t *blimit,
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                const uint8_t *limit,
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                const uint8_t *thresh,
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                                int count)
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0    uint8_t *s,
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1    int p, /* pitch */
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2    const uint8_t *blimit,
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3    const uint8_t *limit,
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp    const uint8_t *thresh,
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp+4  int count
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_lpf_horizontal_8_neon| PROC
    ; Filters a horizontal edge. For each of `count` groups of 8 pixels this
    ; loads the eight rows s - 4 * pitch .. s + 3 * pitch (p3..q3), calls
    ; vp9_mbloop_filter_neon, and writes d0-d5 (op2, op1, op0, oq0, oq1, oq2)
    ; back to the six rows s - 3 * pitch .. s + 2 * pitch.
    ; NOTE(review): r4-r5 are saved and restored but never referenced in this
    ; procedure; confirm whether vp9_mbloop_filter_neon relies on that.
    ; NOTE(review): d0-d2 hold op2/op1/op0 after the call and are not reloaded
    ; with blimit/limit/thresh inside the loop, so a second pass would
    ; presumably hand the helper stale values; verify that count is always 1.
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        {r4-r5, lr}                ; 12 bytes pushed; bl below overwrites lr
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d0[]}, [r2]               ; duplicate *blimit
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #16]             ; load count (sp+4 at entry, +12 for the push)
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r2, [sp, #12]              ; load thresh (sp at entry, +12 for the push)
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r1, r1, r1                 ; r1 = 2 * pitch; r2 and r3 each step two rows
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r12, #0
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         end_vp9_mblf_h_edge        ; nothing to filter when count == 0
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d1[]}, [r3]               ; duplicate *limit
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d2[]}, [r2]               ; duplicate *thresh
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan
304233d2500723e5594f3e7c70896ffeeef32b9c950ywancount_mblf_h_loop
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 * pitch (row p3)
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r2, r3, r1, lsr #1         ; r2 = r3 + pitch (row p2)
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; r3 walks rows p3, p1, q0, q2 and r2 walks rows p2, p0, q1, q3. Each load
    ; reads 8 bytes; @64 specifies a 64-bit aligned address.
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d3}, [r3@64], r1          ; p3
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d4}, [r2@64], r1          ; p2
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d5}, [r3@64], r1          ; p1
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d6}, [r2@64], r1          ; p0
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d7}, [r3@64], r1          ; q0
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d16}, [r2@64], r1         ; q1
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d17}, [r3@64]             ; q2 (no post-increment: r3 stays on row q2)
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d18}, [r2@64], r1         ; q3 (post-increment: r2 ends at s + 5 * pitch)
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r3, r3, r1, lsl #1         ; r3 = s - 2 * pitch (row p1)
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r2, r2, r1, lsl #2         ; r2 = s - 3 * pitch (row p2)
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bl          vp9_mbloop_filter_neon     ; results are read from d0-d5 below
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d0}, [r2@64], r1          ; store op2 to row p2
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d1}, [r3@64], r1          ; store op1 to row p1
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d2}, [r2@64], r1          ; store op0 to row p0
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d3}, [r3@64], r1          ; store oq0 to row q0
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d4}, [r2@64], r1          ; store oq1 to row q1
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst1.u8     {d5}, [r3@64], r1          ; store oq2 to row q2
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r0, r0, #8                 ; s += 8: next group of 8 pixels on the edge
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        r12, r12, #1               ; --count, setting flags for the branch
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         count_mblf_h_loop
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan
333233d2500723e5594f3e7c70896ffeeef32b9c950ywanend_vp9_mblf_h_edge
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         {r4-r5, pc}                ; restore r4-r5 and return
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_lpf_horizontal_8_neon|
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan; void vp9_lpf_vertical_8_neon(uint8_t *s,
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              int pitch,
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              const uint8_t *blimit,
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              const uint8_t *limit,
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              const uint8_t *thresh,
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                              int count)
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0    uint8_t *s,
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r1    int pitch,
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r2    const uint8_t *blimit,
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r3    const uint8_t *limit,
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp    const uint8_t *thresh,
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan; sp+4  int count
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_lpf_vertical_8_neon| PROC
    ; Filters a vertical edge. For each of `count` groups of 8 rows this loads
    ; 8 bytes per row starting at column s - 4, transposes them so that each d
    ; register holds one column, calls vp9_mbloop_filter_neon, and writes
    ; d0-d5 (op2, op1, op0, oq0, oq1, oq2) back to columns s - 3 .. s + 2.
    ; NOTE(review): r4-r5 are saved and restored but never referenced in this
    ; procedure; confirm whether vp9_mbloop_filter_neon relies on that.
    ; NOTE(review): d0-d2 hold op2/op1/op0 after the call and are not reloaded
    ; with blimit/limit/thresh inside the loop, so a second pass would
    ; presumably hand the helper stale values; verify that count is always 1.
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        {r4-r5, lr}               ; 12 bytes pushed; bl below overwrites lr
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d0[]}, [r2]              ; duplicate *blimit
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r12, [sp, #16]            ; load count (sp+4 at entry, +12 for the push)
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d1[]}, [r3]              ; duplicate *limit
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ldr         r3, [sp, #12]             ; load thresh (sp at entry, +12 for the push)
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r2, r0, #4                ; r2 = s - 4 (first of the 8 columns loaded)
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r12, #0
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         end_vp9_mblf_v_edge       ; nothing to filter when count == 0
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.8      {d2[]}, [r3]              ; duplicate *thresh
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
365233d2500723e5594f3e7c70896ffeeef32b9c950ywancount_mblf_v_loop
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d3}, [r2], r1             ; load 8 bytes from each of 8 rows
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d4}, [r2], r1
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d5}, [r2], r1
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d6}, [r2], r1
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d7}, [r2], r1
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d16}, [r2], r1
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d17}, [r2], r1
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vld1.u8     {d18}, [r2]
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;transpose the 8x8 byte block: rows in, one column per register out
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d3, d7
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d4, d16
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d5, d17
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.32     d6, d18
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d3, d5
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d4, d6
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d7, d17
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.16     d16, d18
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d3, d4
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d5, d6
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d7, d16
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vtrn.8      d17, d18
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; Store pointers are set up before the call; this assumes the helper
    ; leaves r0-r3 and r12 untouched, as vp9_loop_filter_neon documents for
    ; itself -- TODO confirm for vp9_mbloop_filter_neon.
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         r2, r0, #3                 ; r2 = s - 3 (column of op2)
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r3, r0, #1                 ; r3 = s + 1 (column of oq1)
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bl          vp9_mbloop_filter_neon     ; results are read from d0-d5 below
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;store op2, op1, op0, oq0: lane i of d0-d3 forms 4 bytes of row i
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;store oq1, oq2: lane i of d4-d5 forms the next 2 bytes of row i
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[0], d5[0]}, [r3], r1
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[1], d5[1]}, [r3], r1
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[2], d5[2]}, [r3], r1
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[3], d5[3]}, [r3], r1
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[4], d5[4]}, [r3], r1
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[5], d5[5]}, [r3], r1
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[6], d5[6]}, [r3], r1
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vst2.8      {d4[7], d5[7]}, [r3]
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         r0, r0, r1, lsl #3         ; s += pitch * 8 (r0 is not moved by the stores)
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subs        r12, r12, #1               ; --count, setting flags for subne/bne
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    subne       r2, r0, #4                 ; r2 = s - 4 for the next pass
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bne         count_mblf_v_loop
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan
421233d2500723e5594f3e7c70896ffeeef32b9c950ywanend_vp9_mblf_v_edge
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         {r4-r5, pc}                ; restore r4-r5 and return
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_lpf_vertical_8_neon|
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan; void vp9_mbloop_filter_neon();
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan; This is a helper function for the loopfilters. The individual functions do the
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan; necessary load, transpose (if necessary) and store. The function does not use
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan; registers d8-d15.
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Inputs:
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0-r3, r12 PRESERVE (r4 and r5 are overwritten as scratch)
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d0    blimit
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d1    limit
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d2    thresh
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d3    p3
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d4    p2
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d5    p1
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d6    p0
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d7    q0
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d16   q1
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d17   q2
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d18   q3
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan; Outputs:
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d0    op2
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d1    op1
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d2    op0
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d3    oq0
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d4    oq1
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d5    oq2
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_mbloop_filter_neon| PROC
    ; Overview: builds the filter mask, the flat mask and hev from p3..q3,
    ; then takes one of three paths depending on (flat & mask):
    ;   all lanes set  -> power_branch_only
    ;   no lanes set   -> filter_branch_only
    ;   mixed          -> both results are computed and merged per lane with
    ;                     vbif/vbit.
    ; r4 and r5 are used as scratch for the lane test.
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter_mask
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only compare the largest value to limit
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d23              ; m1 = max(m1, m3)
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; (limit >= largest abs ()) * -1
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d19, d1, d19
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only compare the largest value to thresh
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.u8     d23, d23, #1               ; a = a / 2
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.u8    d24, d24, d23              ; a = b + a
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d23, #1                    ; constant 1, compared with m2 for flat
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d24, d0, d24               ; (blimit >= a)*-1
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d20, d23, d20              ; flat = (1 >= m2)*-1
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d19, d19, d24              ; mask
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d20, d20, d19              ; flat & mask
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d22, #0x80                 ; sign bit, used for u8 <-> s8 conversion
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vorr        d23, d21, d23              ; hev
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; This instruction will truncate the "flat & mask" masks down to 4 bits
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; each to fit into one 32 bit arm register. The values are stored in
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; q10.64[0] (d20); the narrowed result is read back from d30[0].
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshrn.u16   d30, q10, #4
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan    adds        r5, r4, #1                 ; Check for all 1's (r4 + 1 == 0 sets Z)
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; If mask and flat are 1's for all vectors, then we only need to execute
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; the power branch for all vectors.
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         power_branch_only
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan    cmp         r4, #0                     ; Check for 0, flag is used by "beq filter_branch_only" below
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; mbfilter() function
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter() function
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; convert to signed
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d21, d7, d22               ; qs0
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d24, d6, d22               ; ps0
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d25, d5, d22               ; ps1
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d26, d16, d22              ; qs1
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d27, #3                    ; constant 3
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d29, d29, d23              ; filter &= hev
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d29, #4                    ; constant 4
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter = clamp(filter + 3 * ( qs0 - ps0))
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovn.s16  d28, q15
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d28, d28, d19              ; filter &= mask
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.s8     d30, d30, #3               ; filter2 >>= 3
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.s8     d29, d29, #3               ; filter1 >>= 3
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; outer tap adjustments: ++filter1 >> 1
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s8    d29, d29, #1
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbic        d29, d29, d23              ; filter &= ~hev
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; If mask and flat are 0's for all vectors, then we only need to execute
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; the filter branch for all vectors.
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan    beq         filter_branch_only
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; If mask and flat are mixed then we must perform both branches and
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; combine the data.
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d24, d24, d22              ; *f_op0 = u^0x80
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d21, d21, d22              ; *f_oq0 = u^0x80
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d25, d25, d22              ; *f_op1 = u^0x80
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d26, d26, d22              ; *f_oq1 = u^0x80
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; At this point we have already executed the filter branch. The filter
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; branch and combine the data.
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d23, #2                    ; constant 2 (hev in d23 is no longer needed)
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d5                    ; r_op2 += p1
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d30, q14, #3               ; r_op2
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d4                    ; r_op1 -= p2
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d5                    ; r_op1 += p1
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d16                   ; r_op1 += q1
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d31, q14, #3               ; r_op1
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d5                    ; r_op0 -= p1
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d6                    ; r_op0 += p0
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d17                   ; r_op0 += q2
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d23, q14, #3               ; r_op0
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d6                    ; r_oq0 -= p0
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d7                    ; r_oq0 += q0
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d18                   ; r_oq0 += q3
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d22, q14, #3               ; r_oq0
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan
626233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d7                    ; r_oq1 -= q0
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d16                   ; r_oq1 += q1
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d18                   ; r_oq1 += q3
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d6, q14, #3                ; r_oq1
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d16                   ; r_oq2 -= q1
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d17                   ; r_oq2 += q2
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d18                   ; r_oq2 += q3
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d7, q14, #3                ; r_oq2
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bx          lr
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan
653233d2500723e5594f3e7c70896ffeeef32b9c950ywanpower_branch_only
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d27, #3
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d21, #2
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d5                    ; op2 += p1
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d0, q14, #3                ; op2
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d3                    ; op1 = op2 - p3
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d4                    ; op1 -= p2
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d5                    ; op1 += p1
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d16                   ; op1 += q1
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d1, q14, #3                ; op1
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d3                    ; op0 = op1 - p3
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d5                    ; op0 -= p1
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d6                    ; op0 += p0
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d17                   ; op0 += q2
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d2, q14, #3                ; op0
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d6                    ; oq0 -= p0
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d7                    ; oq0 += q0
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d18                   ; oq0 += q3
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d3, q14, #3                ; oq0
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d7                    ; oq1 -= q0
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d16                   ; oq1 += q1
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d18                   ; oq1 += q3
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d4, q14, #3                ; oq1
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q14, d16                   ; oq2 -= q1
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d17                   ; oq2 += q2
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q14, d18                   ; oq2 += q3
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d5, q14, #3                ; oq2
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bx          lr
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan
694233d2500723e5594f3e7c70896ffeeef32b9c950ywanfilter_branch_only
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; do the 2 vswp.
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vswp        d0, d4                      ; op2 = p2 (d4 is rewritten as oq1 below)
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vswp        d5, d17                     ; oq2 = q2
699233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d2, d24, d22                ; *op0 = u^0x80
700233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d3, d21, d22                ; *oq0 = u^0x80
701233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d1, d25, d22                ; *op1 = u^0x80
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d4, d26, d22                ; *oq1 = u^0x80
703233d2500723e5594f3e7c70896ffeeef32b9c950ywan
704233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bx          lr
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan
706233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_mbloop_filter_neon|
707233d2500723e5594f3e7c70896ffeeef32b9c950ywan
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan    END
709