;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; Assembler set-up for this file (armasm syntax).
    EXPORT  |vp8_bilinear_predict8x4_neon|      ;make the entry point visible to the linker
    ARM                                         ;assemble the following code as ARM instructions
    REQUIRE8                                    ;this code requires 8-byte stack alignment
    PRESERVE8                                   ;this code preserves 8-byte stack alignment

    AREA    ||.text||, CODE, READONLY, ALIGN=2  ;read-only code section, aligned to 2^2 bytes
;-----------------------------------------------------------------------
; vp8_bilinear_predict8x4_neon
;
; Two-pass, two-tap bilinear prediction of an 8-wide x 4-high block.
;   First pass  (horizontal), 5 rows x 8 pixels:
;       row[r][i] = sat_u8((src[i] * f0 + src[i + 1] * f1 + 64) >> 7)
;   Second pass (vertical),   4 rows x 8 pixels:
;       dst[r][i] = sat_u8((row[r][i] * f0 + row[r + 1][i] * f1 + 64) >> 7)
; A pass is bypassed when its offset is 0: with xoffset == 0 the five
; source rows are used unfiltered, with yoffset == 0 the first four
; intermediate rows are stored unfiltered.
;
; Arguments at entry:
;   r0        unsigned char *src_ptr
;   r1        int  src_pixels_per_line
;   r2        int  xoffset   (selects one {f0, f1} pair of bifilter8x4_coeff)
;   r3        int  yoffset   (selects one {f0, f1} pair of bifilter8x4_coeff)
;   [sp]      unsigned char *dst_ptr   (loaded into r4 below)
;   [sp, #4]  int  dst_pitch           (loaded into lr below)
;
; Preserved: r4, d8-d15 (saved and restored here).
; Clobbered: r0-r3, r12, q0-q3, q8-q12, d26, d31, flags.
;
; NOTE(review): the filtered first pass loads 16 bytes per source row
; (vld1 {qN}) although only bytes 0..8 contribute to the result; the
; caller's buffer is presumably readable that far - confirm.
;-----------------------------------------------------------------------

|vp8_bilinear_predict8x4_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;q4-q7 are used below; AAPCS requires d8-d15 to be preserved

    adr             r12, bifilter8x4_coeff
    ldr             r4, [sp, #72]           ;dst_ptr   (8 bytes of core regs + 64 bytes of d8-d15 above it)
    ldr             lr, [sp, #76]           ;dst_pitch

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (5x8)
    add             r2, r12, r2, lsl #3     ;filter location: 8 bytes (two words) per table entry

    vld1.u8         {q1}, [r0], r1          ;load src data, one row per q register
    vld1.u32        {d31}, [r2]             ;load first_pass filter pair
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1): f0 = low byte of word 0
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]              ;f1 = low byte of word 1
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vld1.u8         {q5}, [r0], r1
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1] (row shifted by one pixel)
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
    vqrshrn.u16    d23, q7, #7
    vqrshrn.u16    d24, q8, #7
    vqrshrn.u16    d25, q9, #7
    vqrshrn.u16    d26, q10, #7

;Second pass: 4x8. Input rows are in d22-d26 on every path reaching here.
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;filter location: 8 bytes (two words) per table entry
    add             r0, r4, lr              ;r0 = address of output row 1

    vld1.u32        {d31}, [r3]             ;load second_pass filter pair
    add             r1, r0, lr              ;r1 = address of output row 2

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1

    add             r2, r1, lr              ;r2 = address of output row 3

    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
    vqrshrn.u16    d3, q2, #7
    vqrshrn.u16    d4, q3, #7
    vqrshrn.u16    d5, q4, #7

    vst1.u8         {d2}, [r4]              ;store result, one 8-byte row per pointer
    vst1.u8         {d3}, [r0]
    vst1.u8         {d4}, [r1]
    vst1.u8         {d5}, [r2]

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

;--------------------
;xoffset == 0: take the five source rows as-is (8 bytes each).
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1

    b               secondpass_filter

;---------------------
;yoffset == 0: the first four intermediate rows are the result.
skip_secondpass_filter
    vst1.u8         {d22}, [r4], lr         ;store result, advancing by dst_pitch
    vst1.u8         {d23}, [r4], lr
    vst1.u8         {d24}, [r4], lr
    vst1.u8         {d25}, [r4], lr

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4, pc}

    ENDP
;-----------------
; Two-tap bilinear filter table: eight {f0, f1} pairs of 32-bit words.
; Each pair sums to 128. Pair k is (128 - 16*k, 16*k) and starts at byte
; offset k*8, which is how the predictor indexes it (offset << 3).

bifilter8x4_coeff
    DCD     128,   0                        ;pair 0
    DCD     112,  16                        ;pair 1
    DCD      96,  32                        ;pair 2
    DCD      80,  48                        ;pair 3
    DCD      64,  64                        ;pair 4
    DCD      48,  80                        ;pair 5
    DCD      32,  96                        ;pair 6
    DCD      16, 112                        ;pair 7

    END
136