;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


12    EXPORT  |vp8_yv12_extend_frame_borders_neon|
13    ARM
14    REQUIRE8
15    PRESERVE8
16
17    INCLUDE vpx_scale_asm_offsets.asm
18
19    AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);
;
; Replicates the edge pixels of the Y, U and V planes into the border
; that surrounds each plane: first the left/right borders of every row,
; then the top/bottom borders (copied from the already-extended first
; and last rows, so the corners are filled as well).
;
; In:     r0 = ybf
; Out:    nothing
; Saved:  r4-r10, lr and d8-d15 are pushed on entry, restored on exit
; Uses:   r1-r3, r12, q0-q15, condition flags
;
; Register roles (per plane):
;   r1 / r2   src_ptr1 / src_ptr2
;   r5 / r6   dest_ptr1 / dest_ptr2
;   r3, r4    plane_width, plane_height
;   lr        plane_stride
;   r8        plane_height * plane_stride
;   r12       outer loop counter
;   r7, r9    inner counters (r7 also carries the U/V plane base pointer)
;   r10       number of chroma planes still to process
;
; Preconditions implied by the loops below (not checked here):
;   - we depend on VP8BORDERINPIXELS being 32 (Y border 32, U/V border 16)
;   - y_height is a non-zero multiple of 4 and uv_height a non-zero
;     multiple of 8: the row loops count height/4 and height/8 and
;     decrement before testing, so a zero count would wrap around
;   - top/bottom copies cover the stride in 16-byte (Y) / 8-byte (U/V)
;     units; a remainder of stride % 16 (Y) or stride % 8 (U/V) bytes
;     is not copied
;-----------------------------------------------------------------------

|vp8_yv12_extend_frame_borders_neon| PROC
    push            {r4 - r10, lr}
    vpush           {d8 - d15}              ; q4-q7 are used as scratch below

    ; Border = 32
    ldr             r3, [r0, #yv12_buffer_config_y_width]  ; plane_width
    ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1
    ldr             r4, [r0, #yv12_buffer_config_y_height] ; plane_height
    ldr             lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride

; Border copy for Y plane
; copy the left and right most columns out
    add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
    sub             r5, r1, #32             ; dest_ptr1 = src_ptr1 - Border

    mov             r12, r4, lsr #2         ; plane_height / 4 (4 rows per iteration)

copy_left_right_y
    ; Load one edge byte per row and replicate it across a q register.
    ; r1 walks down the left-most column, r2 down the right-most column.
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    ; Duplicate each q register so a pair holds 32 identical bytes.
    vmov            q1, q0
    vmov            q3, q2
    vmov            q5, q4
    vmov            q7, q6
    vmov            q9, q8
    vmov            q11, q10
    vmov            q13, q12
    vmov            q15, q14

    subs            r12, r12, #1            ; NEON stores below leave the flags intact

    ; 32 bytes into the left border (r5) and right border (r6) of each row.
    vst1.8          {q0, q1}, [r5], lr
    vst1.8          {q2, q3}, [r6], lr
    vst1.8          {q4, q5}, [r5], lr
    vst1.8          {q6, q7}, [r6], lr
    vst1.8          {q8, q9}, [r5], lr
    vst1.8          {q10, q11}, [r6], lr
    vst1.8          {q12, q13}, [r5], lr
    vst1.8          {q14, q15}, [r6], lr

    bne             copy_left_right_y

;Now copy the top and bottom source lines into each line of the respective borders
    ldr             r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer
    mul             r8, r4, lr              ; plane_height * plane_stride

    ; copy width is plane_stride
    movs            r12, lr, lsr #7         ; plane_stride / 128 (number of 128-byte strips)

    ; None of the four instructions below sets flags, so Z from the movs
    ; above is still valid at the branch.
    sub             r1, r1, #32             ; src_ptr1 = y_buffer - Border
    add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride))
    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
    sub             r5, r1, lr, asl #5      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
    ; beq, not ble: movs with a shifted operand leaves V untouched and
    ; lsr clears N, so Z is the only flag that describes the quotient.
    beq             extra_y_copy_needed     ; plane stride < 128

copy_top_bottom_y
    ; Fetch a 128-byte strip of the first row (r1) and the last row (r2).
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q4, q5}, [r1]!
    vld1.8          {q12, q13}, [r2]!
    vld1.8          {q6, q7}, [r1]!
    vld1.8          {q14, q15}, [r2]!

    mov             r7, #32                 ; Border (rows to fill above and below)

top_bottom_32
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!
    vst1.8          {q4, q5}, [r5]!
    vst1.8          {q12, q13}, [r6]!
    vst1.8          {q6, q7}, [r5]!
    vst1.8          {q14, q15}, [r6]!

    ; The stores advanced each pointer by 128; step to the same strip in
    ; the next border row. add/sub without S keep the flags from subs.
    add             r5, r5, lr              ; dest_ptr1 += plane_stride
    sub             r5, r5, #128            ; dest_ptr1 -= 128
    add             r6, r6, lr              ; dest_ptr2 += plane_stride
    sub             r6, r6, #128            ; dest_ptr2 -= 128

    bne             top_bottom_32

    ; r1/r2 already point at the next strip, so rebase the destinations on them.
    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border* plane_stride)
    add             r6, r2, lr              ; src_ptr2 + plane_stride

    subs            r12, r12, #1
    bne             copy_top_bottom_y

extra_y_copy_needed
    ; r7 = number of 16-byte strips left over after the 128-byte strips.
    mov             r7, lr, lsr #4          ; check to see if extra copy is needed
    ands            r7, r7, #0x7
    bne             extra_top_bottom_y
end_of_border_copy_y

;Border copy for U, V planes
; Border = 16
    ldr             r7, [r0, #yv12_buffer_config_u_buffer]  ; src_ptr1
    ldr             lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
    ldr             r3, [r0, #yv12_buffer_config_uv_width]  ; plane_width
    ldr             r4, [r0, #yv12_buffer_config_uv_height] ; plane_height

    mov             r10, #2                 ; two chroma planes: U first, then V

;copy the left and right most columns out
border_copy_uv
    mov             r1, r7                  ; src_ptr1 needs to be saved for second half of loop
    sub             r5, r1, #16             ; dest_ptr1 = src_ptr1 - Border
    add             r6, r1, r3              ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width)
    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1

    mov             r12, r4, lsr #3         ; plane_height / 8 (8 rows per iteration)

copy_left_right_uv
    ; Even q registers take the left edge byte, odd ones the right edge byte.
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d2[], d3[]}, [r2], lr
    vld1.8          {d4[], d5[]}, [r1], lr
    vld1.8          {d6[], d7[]}, [r2], lr
    vld1.8          {d8[], d9[]},  [r1], lr
    vld1.8          {d10[], d11[]}, [r2], lr
    vld1.8          {d12[], d13[]}, [r1], lr
    vld1.8          {d14[], d15[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d18[], d19[]}, [r2], lr
    vld1.8          {d20[], d21[]}, [r1], lr
    vld1.8          {d22[], d23[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d26[], d27[]}, [r2], lr
    vld1.8          {d28[], d29[]}, [r1], lr
    vld1.8          {d30[], d31[]}, [r2], lr

    subs            r12, r12, #1

    ; 16 bytes into the left border (r5) and right border (r6) of each row.
    vst1.8          {q0}, [r5], lr
    vst1.8          {q1}, [r6], lr
    vst1.8          {q2}, [r5], lr
    vst1.8          {q3}, [r6], lr
    vst1.8          {q4}, [r5], lr
    vst1.8          {q5}, [r6], lr
    vst1.8          {q6}, [r5], lr
    vst1.8          {q7}, [r6], lr
    vst1.8          {q8}, [r5], lr
    vst1.8          {q9}, [r6], lr
    vst1.8          {q10}, [r5], lr
    vst1.8          {q11}, [r6], lr
    vst1.8          {q12}, [r5], lr
    vst1.8          {q13}, [r6], lr
    vst1.8          {q14}, [r5], lr
    vst1.8          {q15}, [r6], lr

    bne             copy_left_right_uv

;Now copy the top and bottom source lines into each line of the respective borders
    mov             r1, r7                  ; last use of the saved plane base; r7 is scratch from here on
    mul             r8, r4, lr              ; plane_height * plane_stride
    movs            r12, lr, lsr #6         ; plane_stride / 64 (number of 64-byte strips)

    ; None of the four instructions below sets flags, so Z from the movs
    ; above is still valid at the branch.
    sub             r1, r1, #16             ; src_ptr1 = u_buffer - Border
    add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride)
    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
    ; beq, not ble: movs with a shifted operand leaves V untouched and
    ; lsr clears N, so Z is the only flag that describes the quotient.
    beq             extra_uv_copy_needed    ; plane_stride < 64

copy_top_bottom_uv
    ; Fetch a 64-byte strip of the first row (r1) and the last row (r2).
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!

    mov             r7, #16                 ; Border (rows to fill above and below)

top_bottom_16
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!

    ; The stores advanced each pointer by 64; step to the same strip in
    ; the next border row. add/sub without S keep the flags from subs.
    add             r5, r5, lr              ; dest_ptr1 += plane_stride
    sub             r5, r5, #64
    add             r6, r6, lr              ; dest_ptr2 += plane_stride
    sub             r6, r6, #64

    bne             top_bottom_16

    ; r1/r2 already point at the next strip, so rebase the destinations on them.
    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; dest_ptr2 = src_ptr2 + plane_stride

    subs            r12, r12, #1
    bne             copy_top_bottom_uv
extra_uv_copy_needed
    ; r7 = number of 8-byte strips left over after the 64-byte strips.
    mov             r7, lr, lsr #3          ; check to see if extra copy is needed
    ands            r7, r7, #0x7
    bne             extra_top_bottom_uv

end_of_border_copy_uv
    subs            r10, r10, #1
    ldrne           r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1 (V plane for the second pass)
    bne             border_copy_uv

    vpop            {d8 - d15}
    pop             {r4 - r10, pc}          ; restore registers and return

;;;;;;;;;;;;;;;;;;;;;;
; Y plane: copy the remaining r7 strips of 16 bytes each into the top and
; bottom borders. Entered with r1/r2 = source, r5/r6 = destination.
extra_top_bottom_y
    vld1.8          {q0}, [r1]!
    vld1.8          {q2}, [r2]!

    mov             r9, #4                  ; 4 passes x 8 row stores = 32 border rows

extra_top_bottom_32
    subs            r9, r9, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    bne             extra_top_bottom_32

    ; r1/r2 were post-incremented by 16, so these land on the next strip.
    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; src_ptr2 + plane_stride
    subs            r7, r7, #1
    bne             extra_top_bottom_y

    b               end_of_border_copy_y

; U/V plane: copy the remaining r7 strips of 8 bytes each into the top and
; bottom borders. Entered with r1/r2 = source, r5/r6 = destination.
extra_top_bottom_uv
    vld1.8          {d0}, [r1]!
    vld1.8          {d8}, [r2]!

    mov             r9, #2                  ; 2 passes x 8 row stores = 16 border rows

extra_top_bottom_16
    subs            r9, r9, #1

    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    bne             extra_top_bottom_16

    ; r1/r2 were post-incremented by 8, so these land on the next strip.
    sub             r5, r1, lr, asl #4      ; src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; src_ptr2 + plane_stride
    subs            r7, r7, #1
    bne             extra_top_bottom_uv

    b               end_of_border_copy_uv

    ENDP
    END
