;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_neon|
    EXPORT  |vp8_get16x16pred_error_neon|
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;Note: in this function the sum is never used, so the sum calculation done
;in vp8_variance() is omitted here.
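;
;A rough C-equivalent sketch of what this routine computes (for reference
;only, not part of the build; the helper name mse16x16 is illustrative):
;
;   unsigned int mse16x16(const unsigned char *src_ptr, int source_stride,
;                         const unsigned char *ref_ptr, int recon_stride,
;                         unsigned int *sse)
;   {
;       unsigned int total = 0;
;       for (int i = 0; i < 16; i++)
;       {
;           for (int j = 0; j < 16; j++)
;           {
;               int diff = src_ptr[j] - ref_ptr[j];
;               total += diff * diff;           /* sum of squared differences */
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;       *sse = total;
;       return total;
;   }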

|vp8_mse16x16_neon| PROC
    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                     ;loop counter: 8 iterations x 2 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;16-bit differences, row 0
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6                 ;16-bit differences, row 1
    vsubl.u8        q14, d3, d7

    vmlal.s16       q7, d22, d22                ;accumulate squared differences
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q7, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q7, q7, q8                  ;reduce the four sse accumulators
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]                   ;load *sse from stack

    vadd.u32        q10, q7, q9
    vpaddl.u32      q1, q10
    vadd.u64        d0, d2, d3

    vst1.32         {d0[0]}, [r12]              ;store result through *sse
    vmov.32         r0, d0[0]                   ;also return sse in r0

    bx              lr

    ENDP

;============================
; r0    unsigned char *src_ptr
; r1    int src_stride
; r2    unsigned char *ref_ptr
; r3    int ref_stride
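;
;A rough C-equivalent sketch of the value computed here (for reference only,
;not part of the build; the helper name get16x16pred_error is illustrative):
;
;   int get16x16pred_error(const unsigned char *src_ptr, int src_stride,
;                          const unsigned char *ref_ptr, int ref_stride)
;   {
;       int sum = 0;
;       unsigned int sse = 0;
;       for (int i = 0; i < 16; i++)
;       {
;           for (int j = 0; j < 16; j++)
;           {
;               int diff = src_ptr[j] - ref_ptr[j];
;               sum += diff;
;               sse += diff * diff;
;           }
;           src_ptr += src_stride;
;           ref_ptr += ref_stride;
;       }
;       /* a 16x16 block has 256 pixels, so sum*sum/256 == (sum*sum)>>8 */
;       return sse - ((sum * sum) >> 8);
;   }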
|vp8_get16x16pred_error_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - pred_error
    vmov.i8         q10, #0

    mov             r12, #8                     ;loop counter: 8 iterations x 2 rows

get16x16pred_error_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;16-bit differences
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;accumulate sum
    vmlal.s16       q9, d22, d22                ;accumulate squared differences
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             get16x16pred_error_neon_loop

    vadd.u32        q10, q9, q10                ;reduce sse accumulators
    vpaddl.s32      q0, q8                      ;reduce sum accumulators

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 - sum
    vadd.u64        d1, d2, d3                  ;d1 - sse

    vmull.s32       q5, d0, d0                  ;sum * sum
    vshr.s32        d10, d10, #8                ;(sum * sum) >> 8, i.e. /256
    vsub.s32        d0, d1, d10                 ;pred_error = sse - sum*sum/256

    vmov.32         r0, d0[0]                   ;return pred_error in r0
    bx              lr

    ENDP

;=============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
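;
;A rough C-equivalent sketch (for reference only, not part of the build; the
;helper name get4x4sse_cs is illustrative). Note that each vld1.8 below loads
;8 bytes per row, but only the first 4 differences of each row are squared:
;
;   unsigned int get4x4sse_cs(const unsigned char *src_ptr, int source_stride,
;                             const unsigned char *ref_ptr, int recon_stride)
;   {
;       unsigned int sse = 0;
;       for (int i = 0; i < 4; i++)
;       {
;           for (int j = 0; j < 4; j++)
;           {
;               int diff = src_ptr[j] - ref_ptr[j];
;               sse += diff * diff;
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;       return sse;
;   }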
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;16-bit differences per row
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q7, d22, d22                ;square the first 4 differences
    vmull.s16       q8, d24, d24                ;of each row
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q7, q7, q8                  ;reduce to a single 32-bit sse
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q7, q9

    vpaddl.u32      q1, q9
    vadd.u64        d0, d2, d3

    vmov.32         r0, d0[0]                   ;return sse in r0
    bx              lr

    ENDP

    END