1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11    EXPORT |vp8_subtract_b_neon|
12    EXPORT |vp8_subtract_mby_neon|
13    EXPORT |vp8_subtract_mbuv_neon|
14
15    INCLUDE vp8_asm_enc_offsets.asm
16
17    ARM
18    REQUIRE8
19    PRESERVE8
20
21    AREA ||.text||, CODE, READONLY, ALIGN=2
22
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
;
; Stores the 16-bit difference (source - predictor) for four rows of four
; pixels into the block's src_diff buffer.
;
; In:    r0 = be, r1 = bd, r2 = pitch
;        pitch is the predictor row step in bytes; doubled, it is the
;        row step (in bytes) of the 16-bit diff buffer.
; Clobb: r2, r3, d0-d7, q10-q13 (r4-r7 are saved and restored)
|vp8_subtract_b_neon| PROC

    push            {r4-r7}

    ; Fetch pointers and strides from the two block structures.
    ldr             r4, [r0, #vp8_block_base_src]
    ldr             r3, [r0, #vp8_block_src]
    ldr             r6, [r0, #vp8_block_src_stride]
    ldr             r5, [r0, #vp8_block_src_diff]
    ldr             r7, [r1, #vp8_blockd_predictor]
    ldr             r4, [r4]                        ; dereference base_src
    add             r3, r4, r3                      ; src = *base_src + src

    ; Four source rows. Each load reads 8 bytes, although only the low
    ; four differences of every row are stored below.
    vld1.8          {d0}, [r3], r6
    vld1.8          {d2}, [r3], r6
    vld1.8          {d4}, [r3], r6
    vld1.8          {d6}, [r3], r6

    ; Four predictor rows, stepping by pitch.
    vld1.8          {d1}, [r7], r2
    vld1.8          {d3}, [r7], r2
    vld1.8          {d5}, [r7], r2
    vld1.8          {d7}, [r7], r2

    lsl             r2, r2, #1                      ; diff step = pitch * sizeof(short)

    ; Widening subtract per row, then store the low four halfwords.
    vsubl.u8        q10, d0, d1
    vst1.16         {d20}, [r5], r2
    vsubl.u8        q11, d2, d3
    vst1.16         {d22}, [r5], r2
    vsubl.u8        q12, d4, d5
    vst1.16         {d24}, [r5], r2
    vsubl.u8        q13, d6, d7
    vst1.16         {d26}, [r5], r2

    pop             {r4-r7}
    bx              lr

    ENDP
61
62
;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
;                           unsigned char *pred, int pred_stride)
;
; Computes the 16x16 residual diff[r*16 + c] = src[r][c] - pred[r][c],
; each difference widened to a 16-bit value.
;
; In:    r0 = diff, r1 = src, r2 = src_stride, r3 = pred,
;        pred_stride is the fifth argument and is read from the stack.
; Clobb: r0-r3, r12, q0-q3, q8-q15, flags (r4-r7 are saved and restored)
;
; NEON register use is restricted to q0-q3 and q8-q15. d8-d15 (q4-q7)
; must be preserved by the callee under the AAPCS, so they are left
; untouched rather than spilled to the stack.
|vp8_subtract_mby_neon| PROC
    push            {r4-r7}                 ; r7 unused; four regs keep sp 8-byte aligned
    mov             r12, #4                 ; 4 iterations x 4 rows = 16 rows
    ldr             r4, [sp, #16]           ; pred_stride (above the 16 bytes just pushed)
    mov             r6, #32                 ; diff row pitch: 16 shorts = 32 bytes
    add             r5, r0, #16             ; second diff pointer: columns 8..15

subtract_mby_loop
    vld1.8          {q0}, [r1], r2          ;load src  (row 0)
    vld1.8          {q1}, [r3], r4          ;load pred (row 0)
    vld1.8          {q2}, [r1], r2          ;row 1
    vld1.8          {q3}, [r3], r4
    vld1.8          {q8}, [r1], r2          ;row 2
    vld1.8          {q9}, [r3], r4
    vld1.8          {q10}, [r1], r2         ;row 3
    vld1.8          {q11}, [r3], r4

    ; Rows 0-1: inputs q0-q3, results in q12-q15.
    vsubl.u8        q12, d0, d2
    vsubl.u8        q13, d1, d3
    vsubl.u8        q14, d4, d6
    vsubl.u8        q15, d5, d7
    ; Rows 2-3: inputs q8-q11 (d16-d23). q0-q3 have been consumed above
    ; and are reused for the results.
    vsubl.u8        q0, d16, d18
    vsubl.u8        q1, d17, d19
    vsubl.u8        q2, d20, d22
    vsubl.u8        q3, d21, d23

    vst1.16         {q12}, [r0], r6         ;store diff (left half via r0,
    vst1.16         {q13}, [r5], r6         ;            right half via r5)
    vst1.16         {q14}, [r0], r6
    vst1.16         {q15}, [r5], r6
    vst1.16         {q0}, [r0], r6
    vst1.16         {q1}, [r5], r6
    vst1.16         {q2}, [r0], r6
    vst1.16         {q3}, [r5], r6

    subs            r12, r12, #1
    bne             subtract_mby_loop

    pop             {r4-r7}
    bx              lr
    ENDP
107
;=================================
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                            int src_stride, unsigned char *upred,
;                            unsigned char *vpred, int pred_stride)
;
; Computes the two 8x8 chroma residuals (source - predictor, widened to
; 16 bits). U differences are written starting at diff + 256 shorts and
; the V differences follow immediately after the 64 U values.
;
; In:    r0 = diff, r1 = usrc, r2 = vsrc, r3 = src_stride,
;        upred, vpred and pred_stride are read from the stack.
; Clobb: r0-r3, r12, d0-d7, q8-q15 (r4-r7 are saved and restored)
;
; NEON register use is restricted to d0-d7 and d16-d31. d8-d15 must be
; preserved by the callee under the AAPCS, so they are left untouched
; rather than spilled to the stack.

|vp8_subtract_mbuv_neon| PROC
    push            {r4-r7}
    ldr             r4, [sp, #16]       ; upred
    ldr             r5, [sp, #20]       ; vpred
    ldr             r6, [sp, #24]       ; pred_stride
    add             r0, r0, #512        ; short *udiff = diff + 256;
    mov             r12, #32            ; "diff" stride x2 (two 8-short rows)
    add             r7, r0, #16         ; second diff pointer (odd rows)

;u
    vld1.8          {d0}, [r1], r3      ;load usrc  (row 0)
    vld1.8          {d1}, [r4], r6      ;load upred (row 0)
    vld1.8          {d2}, [r1], r3
    vld1.8          {d3}, [r4], r6
    vld1.8          {d4}, [r1], r3
    vld1.8          {d5}, [r4], r6
    vld1.8          {d6}, [r1], r3
    vld1.8          {d7}, [r4], r6
    vld1.8          {d16}, [r1], r3     ;rows 4-7 go to d16-d23
    vld1.8          {d17}, [r4], r6
    vld1.8          {d18}, [r1], r3
    vld1.8          {d19}, [r4], r6
    vld1.8          {d20}, [r1], r3
    vld1.8          {d21}, [r4], r6
    vld1.8          {d22}, [r1], r3
    vld1.8          {d23}, [r4], r6

    vsubl.u8        q12, d0, d1         ;rows 0-3 -> q12-q15
    vsubl.u8        q13, d2, d3
    vsubl.u8        q14, d4, d5
    vsubl.u8        q15, d6, d7
    vsubl.u8        q0, d16, d17        ;rows 4-7 -> q0-q3 (d0-d7 consumed above)
    vsubl.u8        q1, d18, d19
    vsubl.u8        q2, d20, d21
    vsubl.u8        q3, d22, d23

    vst1.16         {q12}, [r0], r12    ;store diff (even rows via r0,
    vst1.16         {q13}, [r7], r12    ;            odd rows via r7)
    vst1.16         {q14}, [r0], r12
    vst1.16         {q15}, [r7], r12
    vst1.16         {q0}, [r0], r12
    vst1.16         {q1}, [r7], r12
    vst1.16         {q2}, [r0], r12
    vst1.16         {q3}, [r7], r12

;v  (r0/r7 now point just past the 64 U differences)
    vld1.8          {d0}, [r2], r3      ;load vsrc  (row 0)
    vld1.8          {d1}, [r5], r6      ;load vpred (row 0)
    vld1.8          {d2}, [r2], r3
    vld1.8          {d3}, [r5], r6
    vld1.8          {d4}, [r2], r3
    vld1.8          {d5}, [r5], r6
    vld1.8          {d6}, [r2], r3
    vld1.8          {d7}, [r5], r6
    vld1.8          {d16}, [r2], r3     ;rows 4-7 go to d16-d23
    vld1.8          {d17}, [r5], r6
    vld1.8          {d18}, [r2], r3
    vld1.8          {d19}, [r5], r6
    vld1.8          {d20}, [r2], r3
    vld1.8          {d21}, [r5], r6
    vld1.8          {d22}, [r2], r3
    vld1.8          {d23}, [r5], r6

    vsubl.u8        q12, d0, d1         ;rows 0-3 -> q12-q15
    vsubl.u8        q13, d2, d3
    vsubl.u8        q14, d4, d5
    vsubl.u8        q15, d6, d7
    vsubl.u8        q0, d16, d17        ;rows 4-7 -> q0-q3 (d0-d7 consumed above)
    vsubl.u8        q1, d18, d19
    vsubl.u8        q2, d20, d21
    vsubl.u8        q3, d22, d23

    vst1.16         {q12}, [r0], r12    ;store diff
    vst1.16         {q13}, [r7], r12
    vst1.16         {q14}, [r0], r12
    vst1.16         {q15}, [r7], r12
    vst1.16         {q0}, [r0], r12
    vst1.16         {q1}, [r7], r12
    vst1.16         {q2}, [r0], r12
    vst1.16         {q3}, [r7], r12

    pop             {r4-r7}
    bx              lr

    ENDP
198
199    END
200