;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_lpf_horizontal_16_neon|
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan    EXPORT  |vp9_lpf_vertical_16_neon|
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ARM
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan    AREA ||.text||, CODE, READONLY, ALIGN=2
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
;                                 const uint8_t *blimit,
;                                 const uint8_t *limit,
;                                 const uint8_t *thresh,
;                                 int count)
;
; Filters a horizontal edge located at row `s`. Each pass handles an 8-pixel
; wide strip (rows p7..q7, one d register per row); `count` strips are
; processed, stepping `s` 8 bytes to the right each time. Nothing is done
; when count <= 0.
;
; All row accesses use the @64 alignment qualifier, so `s` and the pitch must
; keep every row address 8-byte aligned.
;
; r0       uint8_t *s,
; r1       int p, /* pitch */
; r2       const uint8_t *blimit,
; r3       const uint8_t *limit,
; sp       const uint8_t *thresh,
; sp + 4   int count
|vp9_lpf_horizontal_16_neon| PROC
    push        {r4-r8, lr}
    vpush       {d8-d15}
    ; 24 bytes (r4-r8, lr) + 64 bytes (d8-d15) were pushed, so the stack
    ; arguments now start at sp + 88.
    ldr         r4, [sp, #88]              ; load thresh
    ldr         r12, [sp, #92]             ; load count

    ; Without this guard a zero count would wrap in the subs/bne below and
    ; the loop would run ~2^32 times.
    cmp         r12, #0
    ble         h_end

h_count
    ; Reloaded on every pass: vp9_wide_mbfilter_neon overwrites d16-d18.
    vld1.8      {d16[]}, [r2]              ; load *blimit
    vld1.8      {d17[]}, [r3]              ; load *limit
    vld1.8      {d18[]}, [r4]              ; load *thresh

    sub         r8, r0, r1, lsl #3         ; r8 = s - 8 * pitch (row p7)

    vld1.u8     {d0}, [r8@64], r1          ; p7
    vld1.u8     {d1}, [r8@64], r1          ; p6
    vld1.u8     {d2}, [r8@64], r1          ; p5
    vld1.u8     {d3}, [r8@64], r1          ; p4
    vld1.u8     {d4}, [r8@64], r1          ; p3
    vld1.u8     {d5}, [r8@64], r1          ; p2
    vld1.u8     {d6}, [r8@64], r1          ; p1
    vld1.u8     {d7}, [r8@64], r1          ; p0
    vld1.u8     {d8}, [r8@64], r1          ; q0
    vld1.u8     {d9}, [r8@64], r1          ; q1
    vld1.u8     {d10}, [r8@64], r1         ; q2
    vld1.u8     {d11}, [r8@64], r1         ; q3
    vld1.u8     {d12}, [r8@64], r1         ; q4
    vld1.u8     {d13}, [r8@64], r1         ; q5
    vld1.u8     {d14}, [r8@64], r1         ; q6
    vld1.u8     {d15}, [r8@64], r1         ; q7

    ; Returns flags in r7 that select which set of outputs to store.
    bl          vp9_wide_mbfilter_neon

    tst         r7, #1
    beq         h_mbfilter

    ; flat && mask were not set for any of the channels. Just store the values
    ; from filter.
    sub         r8, r0, r1, lsl #1         ; r8 = s - 2 * pitch (row p1)

    vst1.u8     {d25}, [r8@64], r1         ; store op1
    vst1.u8     {d24}, [r8@64], r1         ; store op0
    vst1.u8     {d23}, [r8@64], r1         ; store oq0
    vst1.u8     {d26}, [r8@64], r1         ; store oq1

    b           h_next

h_mbfilter
    ; NOTE(review): bit 1 of r7 is presumably set by the helper when flat2 is
    ; clear everywhere; that code is outside the part reviewed here.
    tst         r7, #2
    beq         h_wide_mbfilter

    ; flat2 was not set for any of the channels. Just store the values from
    ; mbfilter.
    sub         r8, r0, r1, lsl #1
    sub         r8, r8, r1                 ; r8 = s - 3 * pitch (row p2)

    vst1.u8     {d18}, [r8@64], r1         ; store op2
    vst1.u8     {d19}, [r8@64], r1         ; store op1
    vst1.u8     {d20}, [r8@64], r1         ; store op0
    vst1.u8     {d21}, [r8@64], r1         ; store oq0
    vst1.u8     {d22}, [r8@64], r1         ; store oq1
    vst1.u8     {d23}, [r8@64], r1         ; store oq2

    b           h_next

h_wide_mbfilter
    ; Wide filter: rows p6..q6 are rewritten, p7 and q7 are left untouched.
    sub         r8, r0, r1, lsl #3
    add         r8, r8, r1                 ; r8 = s - 7 * pitch (row p6)

    vst1.u8     {d16}, [r8@64], r1         ; store op6
    vst1.u8     {d24}, [r8@64], r1         ; store op5
    vst1.u8     {d25}, [r8@64], r1         ; store op4
    vst1.u8     {d26}, [r8@64], r1         ; store op3
    vst1.u8     {d27}, [r8@64], r1         ; store op2
    vst1.u8     {d18}, [r8@64], r1         ; store op1
    vst1.u8     {d19}, [r8@64], r1         ; store op0
    vst1.u8     {d20}, [r8@64], r1         ; store oq0
    vst1.u8     {d21}, [r8@64], r1         ; store oq1
    vst1.u8     {d22}, [r8@64], r1         ; store oq2
    vst1.u8     {d23}, [r8@64], r1         ; store oq3
    vst1.u8     {d1}, [r8@64], r1          ; store oq4
    vst1.u8     {d2}, [r8@64], r1          ; store oq5
    vst1.u8     {d3}, [r8@64], r1          ; store oq6

h_next
    add         r0, r0, #8                 ; advance to the next 8-pixel strip
    subs        r12, r12, #1
    bne         h_count

h_end
    vpop        {d8-d15}
    pop         {r4-r8, pc}

    ENDP        ; |vp9_lpf_horizontal_16_neon|
119233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
;                               const uint8_t *blimit,
;                               const uint8_t *limit,
;                               const uint8_t *thresh)
;
; Filters a vertical edge located at column `s` over 8 rows. The 8 bytes on
; each side of the edge are loaded per row, transposed so that every d
; register holds one pixel column (the layout vp9_wide_mbfilter_neon
; expects), filtered, and written back.
;
; The 8-byte row accesses use the @64 alignment qualifier, so `s` and the
; pitch must keep every row address 8-byte aligned.
;
; r0    uint8_t *s,
; r1    int p, /* pitch */
; r2    const uint8_t *blimit,
; r3    const uint8_t *limit,
; sp    const uint8_t *thresh,
|vp9_lpf_vertical_16_neon| PROC
    push        {r4-r8, lr}
    vpush       {d8-d15}
    ; 24 bytes (r4-r8, lr) + 64 bytes (d8-d15) were pushed, so the first
    ; stack argument is now at sp + 88.
    ldr         r4, [sp, #88]              ; load thresh

    vld1.8      {d16[]}, [r2]              ; load *blimit
    vld1.8      {d17[]}, [r3]              ; load *limit
    vld1.8      {d18[]}, [r4]              ; load *thresh

    sub         r8, r0, #8                 ; r8 = 8 bytes left of the edge

    ; Row i: left half -> d(i), right half -> d(8 + i).
    vld1.8      {d0}, [r8@64], r1
    vld1.8      {d8}, [r0@64], r1
    vld1.8      {d1}, [r8@64], r1
    vld1.8      {d9}, [r0@64], r1
    vld1.8      {d2}, [r8@64], r1
    vld1.8      {d10}, [r0@64], r1
    vld1.8      {d3}, [r8@64], r1
    vld1.8      {d11}, [r0@64], r1
    vld1.8      {d4}, [r8@64], r1
    vld1.8      {d12}, [r0@64], r1
    vld1.8      {d5}, [r8@64], r1
    vld1.8      {d13}, [r0@64], r1
    vld1.8      {d6}, [r8@64], r1
    vld1.8      {d14}, [r0@64], r1
    vld1.8      {d7}, [r8@64], r1
    vld1.8      {d15}, [r0@64], r1

    sub         r0, r0, r1, lsl #3         ; rewind r0 to the first row

    ; Transpose both 8x8 byte blocks: afterwards d0-d7 hold columns p7..p0
    ; and d8-d15 hold columns q0..q7.
    vtrn.32     q0, q2
    vtrn.32     q1, q3
    vtrn.32     q4, q6
    vtrn.32     q5, q7

    vtrn.16     q0, q1
    vtrn.16     q2, q3
    vtrn.16     q4, q5
    vtrn.16     q6, q7

    vtrn.8      d0, d1
    vtrn.8      d2, d3
    vtrn.8      d4, d5
    vtrn.8      d6, d7

    vtrn.8      d8, d9
    vtrn.8      d10, d11
    vtrn.8      d12, d13
    vtrn.8      d14, d15

    ; Returns flags in r7 that select which set of outputs to store.
    bl          vp9_wide_mbfilter_neon

    tst         r7, #1
    beq         v_mbfilter

    ; flat && mask were not set for any of the channels. Just store the values
    ; from filter.
    sub         r8, r0, #2                 ; r8 = column p1

    ; Put op1 in d23 and oq0 in d25 so d23..d26 are in memory order
    ; (op1, op0, oq0, oq1) for the 4-element lane stores.
    vswp        d23, d25

    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1

    b           v_end

v_mbfilter
    ; NOTE(review): bit 1 of r7 is presumably set by the helper when flat2 is
    ; clear everywhere; that code is outside the part reviewed here.
    tst         r7, #2
    beq         v_wide_mbfilter

    ; flat2 was not set for any of the channels. Just store the values from
    ; mbfilter.
    sub         r8, r0, #3                 ; r8 = column p2

    ; Per row: op2, op1, op0 go to s - 3 and oq0, oq1, oq2 go to s.
    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1
    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1
    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1

    b           v_end

v_wide_mbfilter
    sub         r8, r0, #8                 ; r8 = 8 bytes left of the edge

    ; Transpose the left block back to rows. Column order is
    ; d0 (p7, unfiltered), d16, d24, d25, d26, d27, d18, d19 (op6..op0).
    vtrn.32     d0,  d26
    vtrn.32     d16, d27
    vtrn.32     d24, d18
    vtrn.32     d25, d19

    vtrn.16     d0,  d24
    vtrn.16     d16, d25
    vtrn.16     d26, d18
    vtrn.16     d27, d19

    vtrn.8      d0,  d16
    vtrn.8      d24, d25
    vtrn.8      d26, d27
    vtrn.8      d18, d19

    ; Transpose the right block back to rows. Column order is
    ; d20, d21, d22, d23, d1, d2, d3 (oq0..oq6), d15 (q7, unfiltered).
    vtrn.32     d20, d1
    vtrn.32     d21, d2
    vtrn.32     d22, d3
    vtrn.32     d23, d15

    vtrn.16     d20, d22
    vtrn.16     d21, d23
    vtrn.16     d1,  d3
    vtrn.16     d2,  d15

    vtrn.8      d20, d21
    vtrn.8      d22, d23
    vtrn.8      d1,  d2
    vtrn.8      d3,  d15

    ; Write the 8 rows back, left half through r8 and right half through r0.
    vst1.8      {d0}, [r8@64], r1
    vst1.8      {d20}, [r0@64], r1
    vst1.8      {d16}, [r8@64], r1
    vst1.8      {d21}, [r0@64], r1
    vst1.8      {d24}, [r8@64], r1
    vst1.8      {d22}, [r0@64], r1
    vst1.8      {d25}, [r8@64], r1
    vst1.8      {d23}, [r0@64], r1
    vst1.8      {d26}, [r8@64], r1
    vst1.8      {d1}, [r0@64], r1
    vst1.8      {d27}, [r8@64], r1
    vst1.8      {d2}, [r0@64], r1
    vst1.8      {d18}, [r8@64], r1
    vst1.8      {d3}, [r0@64], r1
    vst1.8      {d19}, [r8@64], r1
    vst1.8      {d15}, [r0@64], r1

v_end
    vpop        {d8-d15}
    pop         {r4-r8, pc}

    ENDP        ; |vp9_lpf_vertical_16_neon|
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; void vp9_wide_mbfilter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store.
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan; r0-r3 PRESERVE
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d16    blimit
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d17    limit
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d18    thresh
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d0    p7
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d1    p6
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d2    p5
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d3    p4
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d4    p3
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d5    p2
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d6    p1
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d7    p0
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d8    q0
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d9    q1
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d10   q2
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d11   q3
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d12   q4
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d13   q5
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d14   q6
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan; d15   q7
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan|vp9_wide_mbfilter_neon| PROC
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         r7, #0
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter_mask
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only compare the largest value to limit
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d20
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d19, d19, d23
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; abs () > limit
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d19, d17, d19
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; flatmask4
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only compare the largest value to thresh
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d25, d25, d26
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d20, d20, d25
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.u8     d23, d23, #1               ; a = a / 2
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.u8    d24, d24, d23              ; a = b + a
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d30, #1
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d24, d16, d24              ; (a > blimit * 2 + limit) * -1
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d20, d30, d20              ; flat
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d19, d19, d24              ; mask
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; hevmask
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vorr        d21, d21, d22              ; hev
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d16, d20, d19              ; flat && mask
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov        r5, r6, d16
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; only compare the largest value to thresh
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d26, d22, d23
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d27, d24, d25
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmax.u8     d23, d26, d27
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vcge.u8     d18, d30, d23              ; flat2
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d22, #0x80
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orrs        r5, r5, r6                 ; Check for 0
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orreq       r7, r7, #1                 ; Only do filter branch
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d17, d18, d16              ; flat2 && flat && mask
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov        r5, r6, d17
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; mbfilter() function
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter() function
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; convert to signed
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d23, d8, d22               ; qs0
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d24, d7, d22               ; ps0
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d25, d6, d22               ; ps1
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d26, d9, d22               ; qs1
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d27, #3
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d29, d29, d21              ; filter &= hev
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d29, #4
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; filter = clamp(filter + 3 * ( qs0 - ps0))
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqmovn.s16  d28, q15
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vand        d28, d28, d19              ; filter &= mask
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.s8     d30, d30, #3               ; filter2 >>= 3
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vshr.s8     d29, d29, #3               ; filter1 >>= 3
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; outer tap adjustments: ++filter1 >> 1
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vrshr.s8    d29, d29, #1
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbic        d29, d29, d21              ; filter &= ~hev
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d24, d24, d22              ; *f_op0 = u^0x80
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d23, d23, d22              ; *f_oq0 = u^0x80
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d25, d25, d22              ; *f_op1 = u^0x80
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan    veor        d26, d26, d22              ; *f_oq1 = u^0x80
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    tst         r7, #1
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bxne        lr
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orrs        r5, r5, r6                 ; Check for 0
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    orreq       r7, r7, #2                 ; Only do mbfilter branch
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; mbfilter flat && mask branch
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; and using vbit on the q's?
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d29, #2
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q10, d4, d5
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d6, d9
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d18, q15, #3               ; r_op2
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q10
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q10, d4, d6
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d7, d10
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d19, q15, #3               ; r_op1
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q10
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d8, d11
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d20, q15, #3               ; r_op0
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q15, d7                    ; oq0 -= p0
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d9, d11
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d21, q15, #3               ; r_oq0
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q15, d8                    ; oq1 -= q0
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d10, d11
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d22, q15, #3               ; r_oq1
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q15, d6                    ; oq2 = oq1 - p1
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsubw.u8    q15, d9                    ; oq2 -= q1
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d27, q15, #3               ; r_oq2
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; Filter does not set op2 or oq2, so use p2 and q2.
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan    tst         r7, #2
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bxne        lr
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; wide_mbfilter flat2 && flat && mask branch
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmov.u8     d16, #7
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q12, d2, d3
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q13, d4, d5
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d1, d6
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vmlal.u8    q15, d0, d16               ; op6 += p7 * 7
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q12, q13
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d2, d9
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q12
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q12, d0, d1
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d1
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q13, d0, d2
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q14, q15, q14
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d16, q15, #4               ; w_op6
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14, q12
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d3, d10
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d24, q15, #4               ; w_op5
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q13
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q13, d0, d3
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d4, d11
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d25, q15, #4               ; w_op4
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q14
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d0, d4
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q13
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q14, q15, q14
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d26, q15, #4               ; w_op3
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, q14, d5               ; op2 += p2
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d0, d5
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d12                   ; op2 += q4
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d27, q15, #4               ; w_op2
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d0, d6
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d6                    ; op1 += p1
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d13                   ; op1 += q5
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d18, q15, #4               ; w_op1
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d0, d7
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d7                    ; op0 += p0
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d14                   ; op0 += q6
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d19, q15, #4               ; w_op0
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d1, d8
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d8                    ; oq0 += q0
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d15                   ; oq0 += q7
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d20, q15, #4               ; w_oq0
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d2, d9
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d9                    ; oq1 += q1
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q4, d10, d15
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddw.u8    q15, d15                   ; oq1 += q7
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d21, q15, #4               ; w_oq1
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d3, d10
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q4
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q4, d11, d15
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d22, q15, #4               ; w_oq2
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d4, d11
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q4
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q4, d12, d15
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d23, q15, #4               ; w_oq3
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d5, d12
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q4
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q4, d13, d15
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d1, q15, #4                ; w_oq4
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q14, d6, d13
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q4
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vaddl.u8    q4, d14, d15
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d2, q15, #4                ; w_oq5
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vsub.i16    q15, q14
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vadd.i16    q15, q4
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vqrshrn.u16 d3, q15, #4                ; w_oq6
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan    bx          lr
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ENDP        ; |vp9_wide_mbfilter_neon|
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan    END
607