;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_subtract_b_neon|
    EXPORT |vp8_subtract_mby_neon|
    EXPORT |vp8_subtract_mbuv_neon|

    INCLUDE asm_enc_offsets.asm

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
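;
; For reference, a scalar sketch of the operation performed below
; (mirrors the generic C path; 4x4 block producing 16-bit differences;
; field names taken from the offsets used by this routine):
;   unsigned char *src  = *be->base_src + be->src;
;   unsigned char *pred = bd->predictor;
;   short         *diff = be->src_diff;
;   for (int r = 0; r < 4; r++, src += be->src_stride,
;                                pred += pitch, diff += pitch)
;       for (int c = 0; c < 4; c++)
;           diff[c] = (short)src[c] - (short)pred[c];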
|vp8_subtract_b_neon| PROC

    stmfd   sp!, {r4-r7}

    ldr     r3, [r0, #vp8_block_base_src]
    ldr     r4, [r0, #vp8_block_src]
    ldr     r5, [r0, #vp8_block_src_diff]
    ldr     r3, [r3]
    ldr     r6, [r0, #vp8_block_src_stride]
    add     r3, r3, r4                      ; src = *base_src + src
    ldr     r7, [r1, #vp8_blockd_predictor]

    vld1.8          {d0}, [r3], r6          ; load src (8 bytes; only 4 per row are used)
    vld1.8          {d1}, [r7], r2          ; load pred
    vld1.8          {d2}, [r3], r6
    vld1.8          {d3}, [r7], r2
    vld1.8          {d4}, [r3], r6
    vld1.8          {d5}, [r7], r2
    vld1.8          {d6}, [r3], r6
    vld1.8          {d7}, [r7], r2

    vsubl.u8        q10, d0, d1             ; diff = src - pred, widened to 16 bits
    vsubl.u8        q11, d2, d3
    vsubl.u8        q12, d4, d5
    vsubl.u8        q13, d6, d7

    mov             r2, r2, lsl #1          ; diff rows are pitch shorts = 2*pitch bytes apart

    vst1.16         {d20}, [r5], r2         ; store diff (4 shorts per row)
    vst1.16         {d22}, [r5], r2
    vst1.16         {d24}, [r5], r2
    vst1.16         {d26}, [r5], r2

    ldmfd   sp!, {r4-r7}
    bx              lr

    ENDP


;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
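;
; Scalar sketch of the operation (mirrors the generic C path; a 16x16
; luma block with contiguous 16-wide pred and diff rows):
;   for (int r = 0; r < 16; r++, src += stride, pred += 16, diff += 16)
;       for (int c = 0; c < 16; c++)
;           diff[c] = (short)src[c] - (short)pred[c];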
|vp8_subtract_mby_neon| PROC
    vpush           {d8-d15}                ; q4-q7 (d8-d15) are callee-saved per AAPCS
    mov             r12, #4                 ; 4 iterations x 4 rows = 16 rows

subtract_mby_loop
    vld1.8          {q0}, [r1], r3          ; load src
    vld1.8          {q1}, [r2]!             ; load pred
    vld1.8          {q2}, [r1], r3
    vld1.8          {q3}, [r2]!
    vld1.8          {q4}, [r1], r3
    vld1.8          {q5}, [r2]!
    vld1.8          {q6}, [r1], r3
    vld1.8          {q7}, [r2]!

    vsubl.u8        q8, d0, d2              ; diff = src - pred, widened to 16 bits
    vsubl.u8        q9, d1, d3
    vsubl.u8        q10, d4, d6
    vsubl.u8        q11, d5, d7
    vsubl.u8        q12, d8, d10
    vsubl.u8        q13, d9, d11
    vsubl.u8        q14, d12, d14
    vsubl.u8        q15, d13, d15

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

    subs            r12, r12, #1
    bne             subtract_mby_loop

    vpop            {d8-d15}
    bx              lr
    ENDP

;=================================
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
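;
; Scalar sketch of the operation (mirrors the generic C path; two 8x8
; chroma blocks stored after the 256 luma entries of diff and pred):
;   short *udiff = diff + 256;  unsigned char *upred = pred + 256;
;   short *vdiff = diff + 320;  unsigned char *vpred = pred + 320;
;   for (int r = 0; r < 8; r++, usrc += stride, upred += 8, udiff += 8)
;       for (int c = 0; c < 8; c++)
;           udiff[c] = (short)usrc[c] - (short)upred[c];
;   ...and likewise for vsrc/vpred/vdiff. The code below reaches the v
;   block simply by letting r0 and r3 advance past the u block.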
|vp8_subtract_mbuv_neon| PROC
    vpush           {d8-d15}                ; d8-d15 are callee-saved per AAPCS
    ldr             r12, [sp, #64]          ; stride (5th arg; the vpush moved sp by 64)

;u
    add             r0, r0, #512            ; short *udiff = diff + 256;
    add             r3, r3, #256            ; unsigned char *upred = pred + 256;

    vld1.8          {d0}, [r1], r12         ; load usrc
    vld1.8          {d1}, [r3]!             ; load upred
    vld1.8          {d2}, [r1], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r1], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r1], r12
    vld1.8          {d7}, [r3]!
    vld1.8          {d8}, [r1], r12
    vld1.8          {d9}, [r3]!
    vld1.8          {d10}, [r1], r12
    vld1.8          {d11}, [r3]!
    vld1.8          {d12}, [r1], r12
    vld1.8          {d13}, [r3]!
    vld1.8          {d14}, [r1], r12
    vld1.8          {d15}, [r3]!

    vsubl.u8        q8, d0, d1              ; diff = src - pred, widened to 16 bits
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7
    vsubl.u8        q12, d8, d9
    vsubl.u8        q13, d10, d11
    vsubl.u8        q14, d12, d13
    vsubl.u8        q15, d14, d15

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

;v
    vld1.8          {d0}, [r2], r12         ; load vsrc
    vld1.8          {d1}, [r3]!             ; load vpred
    vld1.8          {d2}, [r2], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r2], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r2], r12
    vld1.8          {d7}, [r3]!
    vld1.8          {d8}, [r2], r12
    vld1.8          {d9}, [r3]!
    vld1.8          {d10}, [r2], r12
    vld1.8          {d11}, [r3]!
    vld1.8          {d12}, [r2], r12
    vld1.8          {d13}, [r3]!
    vld1.8          {d14}, [r2], r12
    vld1.8          {d15}, [r3]!

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7
    vsubl.u8        q12, d8, d9
    vsubl.u8        q13, d10, d11
    vsubl.u8        q14, d12, d13
    vsubl.u8        q15, d14, d15

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

    vpop            {d8-d15}
    bx              lr
    ENDP

    END