;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


12    EXPORT  |vp8_bilinear_predict8x8_neon|
13    ARM
14    REQUIRE8
15    PRESERVE8
16
17    AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr,
;                                   int            src_pixels_per_line,
;                                   int            xoffset,
;                                   int            yoffset,
;                                   unsigned char *dst_ptr,
;                                   int            dst_pitch);
;
; Two-pass 2-tap bilinear prediction of an 8x8 byte block: a horizontal
; pass over 9 source rows, then a vertical pass that blends row n with
; row n+1. Each tap pair comes from bifilter8_coeff; every result is
; rounded, shifted right by 7 and saturated to u8.
;
; In:
;   r0         src_ptr
;   r1         src_pixels_per_line (source row stride in bytes)
;   r2         xoffset  - table index; 0 skips the horizontal pass
;   r3         yoffset  - table index; 0 skips the vertical pass
;   1st stack  dst_ptr    (loaded into r4)
;   2nd stack  dst_pitch  (loaded into lr; destination row stride in bytes)
;   The table holds 8 tap pairs, so offsets 0..7 stay inside it.
; Out:
;   8 rows of 8 bytes written starting at dst_ptr.
; Clobbers:
;   r0-r3, r12, flags, q0-q3, q8-q15
; Preserves:
;   r4 and d8-d15 (q4-q7 are used as scratch and restored on exit)
; Notes:
;   - The horizontal pass loads 16 bytes per source row although only
;     bytes 0..8 contribute to the result.
;   - Nine source rows are always read, even when yoffset is 0.
;   - Taps are picked with byte lanes 0 and 4 of the loaded pair, i.e. the
;     low byte of each 32-bit table word when the data is little-endian.
;-----------------------------------------------------------------------

|vp8_bilinear_predict8x8_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;q4-q7 are scratch below; d8-d15 are callee-saved

    adr             r12, bifilter8_coeff    ;pc-relative table base (no absolute address needed)
    ldr             r4, [sp, #72]           ;dst_ptr:   1st stack arg, above 8 + 64 saved bytes
    ldr             lr, [sp, #76]           ;dst_pitch: 2nd stack arg

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;filter location: 8 bytes per table entry

    vld1.u8         {q1}, [r0], r1          ;load src rows 0-3 (16 bytes each)
    vld1.u32        {d31}, [r2]             ;load first_pass filter {tap0, tap1}
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;d0 = tap0 replicated to all 8 lanes
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]              ;d1 = tap1 replicated to all 8 lanes
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]: bytes 1..8 of the row
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src rows 4-7 while rows 0-3 are narrowed
    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16    d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16    d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16    d25, q9, #7

    ;first_pass filtering on the remaining 5 rows (4-8)
    vld1.u8         {q5}, [r0], r1          ;row 8: only needed as "next row" in the second pass

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d27, q7, #7
    vqrshrn.u16    d28, q8, #7
    vqrshrn.u16    d29, q9, #7
    vqrshrn.u16    d30, q10, #7

;Second pass: 8x8
;On entry d22-d30 hold the 9 intermediate rows (row n in d(22+n)).
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;filter location: 8 bytes per table entry
    add             r0, r4, lr              ;r0 = address of dst row 1

    vld1.u32        {d31}, [r3]             ;load second_pass filter {tap0, tap1}
    add             r1, r0, lr              ;r1 = address of dst row 2

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1]): next row
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
    vqrshrn.u16    d3, q2, #7
    vqrshrn.u16    d4, q3, #7
    vqrshrn.u16    d5, q4, #7
    vqrshrn.u16    d6, q5, #7
    vqrshrn.u16    d7, q6, #7
    vqrshrn.u16    d8, q7, #7
    vqrshrn.u16    d9, q8, #7

    vst1.u8         {d2}, [r4]              ;store result: row 0
    vst1.u8         {d3}, [r0]              ;row 1
    vst1.u8         {d4}, [r1], lr          ;rows 2-7, stepping by dst_pitch
    vst1.u8         {d5}, [r1], lr
    vst1.u8         {d6}, [r1], lr
    vst1.u8         {d7}, [r1], lr
    vst1.u8         {d8}, [r1], lr
    vst1.u8         {d9}, [r1], lr

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

;--------------------
;xoffset == 0: no horizontal filtering, so the 9 source rows are used
;directly as the intermediate rows (8 bytes each).
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;---------------------
;yoffset == 0: no vertical filtering, so intermediate rows 0-7 are the
;final output (row 8 in d30 is not stored).
skip_secondpass_filter
    vst1.u8         {d22}, [r4], lr         ;store result
    vst1.u8         {d23}, [r4], lr
    vst1.u8         {d24}, [r4], lr
    vst1.u8         {d25}, [r4], lr
    vst1.u8         {d26}, [r4], lr
    vst1.u8         {d27}, [r4], lr
    vst1.u8         {d28}, [r4], lr
    vst1.u8         {d29}, [r4], lr

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

    ENDP
177
178;-----------------
179
; Literal word holding the address of the coefficient table, for code that
; obtains the table base with a pc-relative LDR of this label.
_bifilter8_coeff_
    DCD     bifilter8_coeff

; 2-tap bilinear filter table: one {tap0, tap1} pair of 32-bit words
; (8 bytes) per entry, 8 entries. Each pair sums to 128.
bifilter8_coeff
    DCD     128,   0
    DCD     112,  16
    DCD      96,  32
    DCD      80,  48
    DCD      64,  64
    DCD      48,  80
    DCD      32,  96
    DCD      16, 112

    END
186