;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sub_pixel_variance8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0    unsigned char  *src_ptr,
; r1    int  src_pixels_per_line,
; r2    int  xoffset,
; r3    int  yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(lr) unsigned int *sse
;Note: most of this code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
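;
;The routine bilinearly interpolates the 8x8 source block at (xoffset, yoffset)
;in 1/8-pel units, then returns the variance of that prediction against dst:
;variance = sse - (sum * sum) / 64, over the 64 pixel differences.
;
;A minimal C sketch of the same computation (for reference only; the helper
;bilinear_filter_8x8 and the parameter names are illustrative, taken from the
;register comments above, not from an official header):
;
;  unsigned int vp8_sub_pixel_variance8x8(const unsigned char *src_ptr,
;                                         int src_pixels_per_line,
;                                         int xoffset, int yoffset,
;                                         const unsigned char *dst_ptr,
;                                         int dst_pixels_per_line,
;                                         unsigned int *sse)
;  {
;      unsigned char pred[8 * 8];
;      int i, j, d, sum = 0;
;      unsigned int diff_sq = 0;
;      /* two-pass bilinear filter, as in the first/second pass below */
;      bilinear_filter_8x8(src_ptr, src_pixels_per_line, xoffset, yoffset, pred);
;      for (i = 0; i < 8; i++)
;          for (j = 0; j < 8; j++) {
;              d = pred[i * 8 + j] - dst_ptr[i * dst_pixels_per_line + j];
;              sum += d;
;              diff_sq += d * d;
;          }
;      *sse = diff_sq;
;      return diff_sq - (unsigned int)((sum * sum) >> 6);   /* >>6 == /64 */
;  }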

|vp8_sub_pixel_variance8x8_neon| PROC
    push            {r4-r5, lr}

    adr             r12, bilinear_taps_coeff
    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
    ldr             lr, [sp, #20]           ;load *sse from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;calculate filter location
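;each bilinear_taps_coeff entry is two 32-bit words (8 bytes), hence lsl #3;
;the two taps of an entry always sum to 128, which the #7 narrowing shifts
;below divide back out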

    vld1.u8         {q1}, [r0], r1          ;load src data
    vld1.u32        {d31}, [r2]             ;load first_pass filter
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src data
    vqrshrn.u16     d22, q6, #7             ;shift/round/saturate to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16     d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16     d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16     d25, q9, #7
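;d22-d25 now hold the first 4 horizontally filtered rows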

    ;first-pass filtering on the remaining 5 rows
    vld1.u8         {q5}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16     d26, q6, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d27, q7, #7
    vqrshrn.u16     d28, q8, #7
    vqrshrn.u16     d29, q9, #7
    vqrshrn.u16     d30, q10, #7
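;d22-d30 now hold all 9 horizontally filtered rows; the second (vertical)
;pass combines row i and row i+1 to produce output row i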

;Second pass: 8x8
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             sub_pixel_variance8x8_neon  ;skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;calculate filter location

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16     d22, q1, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d23, q2, #7
    vqrshrn.u16     d24, q3, #7
    vqrshrn.u16     d25, q4, #7
    vqrshrn.u16     d26, q5, #7
    vqrshrn.u16     d27, q6, #7
    vqrshrn.u16     d28, q7, #7
    vqrshrn.u16     d29, q8, #7
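;d22-d29 now hold the final 8x8 bilinear predictor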

    b               sub_pixel_variance8x8_neon

;--------------------
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1
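;xoffset=0: no horizontal filtering needed, so load 9 raw source rows for
;the vertical pass (if yoffset=0 as well, only d22-d29 are used below)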

    b               secondpass_filter

;----------------------
;vp8_variance8x8_neon
sub_pixel_variance8x8_neon
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2
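;each iteration compares 4 predictor rows (d22-d25) against 4 dst rows,
;so 2 iterations cover the whole 8x8 block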

sub_pixel_variance8x8_neon_loop
    vld1.8          {d0}, [r4], r5              ;load dst data
    subs            r12, r12, #1
    vld1.8          {d1}, [r4], r5
    vld1.8          {d2}, [r4], r5
    vsubl.u8        q4, d22, d0                 ;calculate diff
    vld1.8          {d3}, [r4], r5

    vsubl.u8        q5, d23, d1
    vsubl.u8        q6, d24, d2

    vpadal.s16      q8, q4                      ;sum
    vmlal.s16       q9, d8, d8                  ;sse
    vmlal.s16       q10, d9, d9

    vsubl.u8        q7, d25, d3

    vpadal.s16      q8, q5
    vmlal.s16       q9, d10, d10
    vmlal.s16       q10, d11, d11

    vmov            q11, q13                    ;rotate rows 4-5 into d22-d23 for next iteration

    vpadal.s16      q8, q6
    vmlal.s16       q9, d12, d12
    vmlal.s16       q10, d13, d13

    vmov            q12, q14                    ;rotate rows 6-7 into d24-d25

    vpadal.s16      q8, q7
    vmlal.s16       q9, d14, d14
    vmlal.s16       q10, d15, d15

    bne             sub_pixel_variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [lr]               ;store sse
    vshr.s32        d10, d10, #6                ;(sum * sum) / 64
    vsub.s32        d0, d1, d10                 ;variance = sse - (sum * sum) / 64

    vmov.32         r0, d0[0]                   ;return variance
    pop             {r4-r5, pc}

    ENDP

;-----------------

bilinear_taps_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
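;entry i holds the bilinear tap pair (128 - 16*i, 16*i) for offset i = 0..7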

    END