1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_yv12_copy_frame_yonly_neon|
13    EXPORT  |vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon|
14
15    ARM
16    REQUIRE8
17    PRESERVE8
18
19    INCLUDE asm_com_offsets.asm
20
21    AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc,
;                                     YV12_BUFFER_CONFIG *dst_ybc);
;
; Copies the Y plane of src_ybc into dst_ybc, two rows per iteration,
; then extends the destination Y plane into its border: the first and
; last pixel of every row are replicated sideways, and the first and
; last (already widened) rows are replicated upwards and downwards.
;
; In:    r0 = src_ybc, r1 = dst_ybc
; Saved: r4-r11, lr and d8-d15 are pushed on entry and restored on exit
;        (q4-q7 are used as copy registers).
;
; Note: this is a VP8 function. Assumptions that are NOT checked here:
;   - internal y_width and y_height are always multiples of 16;
;   - the destination border is 16 or 32 (any value other than 16 takes
;     the 32-byte left/right path);
;   - y_width = y_stride - 2*border, so y_width is not reloaded for the
;     border extension.
; Rows (or strides) shorter than 128 bytes are handled by the 16-byte
; remainder loops only; the 128-byte loops are skipped for them.
;-----------------------------------------------------------------------

|vp8_yv12_copy_frame_yonly_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}

    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]       ;src stride
    ldr             r7, [r1, #yv12_buffer_config_y_stride]       ;dst stride
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1

    ; The width loop below decrements before it tests, so it must never
    ; be entered with a zero count (the counter would wrap around and
    ; the copy would run far past both buffers). Rows narrower than 128
    ; bytes are copied entirely by the 16-byte remainder path.
    movs            r12, r5, lsr #7
    beq             cp_src_to_dst_remainder

    ; copy two rows at one time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_loop
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r10, r2, r6             ;src, odd row
    add             r11, r3, r7             ;dst, odd row
    mov             r12, r5, lsr #7         ;number of 128-byte chunks (> 0 here)

cp_src_to_dst_width_loop
    vld1.8          {q0, q1}, [r8]!
    vld1.8          {q8, q9}, [r10]!
    vld1.8          {q2, q3}, [r8]!
    vld1.8          {q10, q11}, [r10]!
    vld1.8          {q4, q5}, [r8]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q6, q7}, [r8]!
    vld1.8          {q14, q15}, [r10]!

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r9]!
    vst1.8          {q8, q9}, [r11]!
    vst1.8          {q2, q3}, [r9]!
    vst1.8          {q10, q11}, [r11]!
    vst1.8          {q4, q5}, [r9]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q6, q7}, [r9]!
    vst1.8          {q14, q15}, [r11]!

    bne             cp_src_to_dst_width_loop

    subs            lr, lr, #1
    add             r2, r2, r6, lsl #1      ;advance two source rows
    add             r3, r3, r7, lsl #1      ;advance two destination rows

    bne             cp_src_to_dst_height_loop

cp_src_to_dst_remainder
    ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
    sub             r11, r5, r10                    ;bytes per row already copied
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
    bne             extra_cp_src_to_dst_width
end_of_cp_src_to_dst


    ;vpxyv12_extend_frame_borders_yonly
    mov             r0, r1
    ;Not need to load y_width, since: y_width = y_stride - 2*border
    ldr             r3, [r0, #yv12_buffer_config_border]
    ldr             r1, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             lr, [r0, #yv12_buffer_config_y_stride]

;Border copy for Y plane
;copy the left and right most columns out
    sub             r5, r1, r3              ;destptr1: left border of row 0
    add             r6, r1, lr
    sub             r6, r6, r3, lsl #1      ;destptr2: right border of row 0
    sub             r2, r6, #1              ;srcptr2: last pixel of row 0

    ;Do four rows at one time
    mov             r12, r4, lsr #2

    cmp             r3, #16
    beq             b16_extend_frame_borders

;=======================
b32_extend_frame_borders
;border = 32: each edge pixel is replicated into 32 bytes
;=======================
copy_left_right_y
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    ; duplicate each 16-byte splat so that 32 bytes can be stored at once
    vmov            q1, q0
    vmov            q3, q2
    vmov            q5, q4
    vmov            q7, q6
    vmov            q9, q8
    vmov            q11, q10
    vmov            q13, q12
    vmov            q15, q14

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r5], lr
    vst1.8          {q2, q3}, [r6], lr
    vst1.8          {q4, q5}, [r5], lr
    vst1.8          {q6, q7}, [r6], lr
    vst1.8          {q8, q9}, [r5], lr
    vst1.8          {q10, q11}, [r6], lr
    vst1.8          {q12, q13}, [r5], lr
    vst1.8          {q14, q15}, [r6], lr

    bne             copy_left_right_y

    b               extend_top_bottom_y

;=======================
b16_extend_frame_borders
;border = 16: each edge pixel is replicated into 16 bytes
;=======================
copy_left_right_y_b16
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    subs            r12, r12, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q4}, [r5], lr
    vst1.8          {q6}, [r6], lr
    vst1.8          {q8}, [r5], lr
    vst1.8          {q10}, [r6], lr
    vst1.8          {q12}, [r5], lr
    vst1.8          {q14}, [r6], lr

    bne             copy_left_right_y_b16

;=======================
;Now copy the top and bottom source lines into each line of the respective borders.
;This part is shared by both border sizes: r3 (border) is the number of
;border rows to fill. On entry r1 = y_buffer + y_height * y_stride.
extend_top_bottom_y
    ldr             r7, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    mul             r8, r3, lr              ;border * stride

    sub             r6, r1, r3              ;destptr2: first row below the plane
    sub             r2, r6, lr              ;srcptr2: last row, from its left border
    sub             r1, r7, r3              ;srcptr1: first row, from its left border
    sub             r5, r1, r8              ;destptr1: first row of the top border

    ; As in the plane copy, skip the 128-byte column loop when the
    ; stride is shorter than 128 bytes so that its counter cannot wrap.
    movs            r12, lr, lsr #7
    beq             top_bottom_remainder_y

copy_top_bottom_y
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q4, q5}, [r1]!
    vld1.8          {q12, q13}, [r2]!
    vld1.8          {q6, q7}, [r1]!
    vld1.8          {q14, q15}, [r2]!

    mov             r7, r3                  ;one iteration per border row

top_bottom_32
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!
    vst1.8          {q4, q5}, [r5]!
    vst1.8          {q12, q13}, [r6]!
    vst1.8          {q6, q7}, [r5]!
    vst1.8          {q14, q15}, [r6]!

    ; step to the same 128-byte column of the next border row
    add             r5, r5, lr
    sub             r5, r5, #128
    add             r6, r6, lr
    sub             r6, r6, #128

    bne             top_bottom_32

    ; r1/r2 were post-incremented by the loads, so these point at the
    ; next column of the first top / first bottom border row
    sub             r5, r1, r8
    add             r6, r2, lr

    subs            r12, r12, #1
    bne             copy_top_bottom_y

top_bottom_remainder_y
    mov             r7, lr, lsr #4              ;check to see if extra copy is needed
    ands            r7, r7, #0x7                ;16-byte columns left over
    bne             extra_top_bottom_y
end_of_border_copy_y

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

;=====================
;extra copy part for Y: one 16-byte column of the top and bottom border
;per outer iteration
extra_top_bottom_y
    vld1.8          {q0}, [r1]!
    vld1.8          {q2}, [r2]!

    mov             r9, r3, lsr #3          ;eight border rows per inner iteration

extra_top_bottom_32
    subs            r9, r9, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    bne             extra_top_bottom_32

    sub             r5, r1, r8
    add             r6, r2, lr
    subs            r7, r7, #1
    bne             extra_top_bottom_y

    b               end_of_border_copy_y

;=============================
;Copy the last (y_width & 0x7f) bytes of every row, 16 bytes at a time.
;On entry: r10 = bytes left per row (non-zero), r11 = bytes per row
;already copied, r2/r3 = start of the source/destination plane.
;r0 (src_ybc) is reused as a row pointer here; it is reloaded from r1
;at end_of_cp_src_to_dst.
extra_cp_src_to_dst_width
    add             r2, r2, r11
    add             r3, r3, r11

    mov             lr, r4, lsr #1          ;two rows per iteration
extra_cp_src_to_dst_height_loop
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r0, r8, r6              ;src, odd row
    add             r11, r9, r7             ;dst, odd row

    mov             r12, r10

extra_cp_src_to_dst_width_loop
    vld1.8          {q0}, [r8]!
    vld1.8          {q1}, [r0]!

    subs            r12, r12, #16

    vst1.8          {q0}, [r9]!
    vst1.8          {q1}, [r11]!
    bne             extra_cp_src_to_dst_width_loop

    subs            lr, lr, #1

    add             r2, r2, r6, lsl #1
    add             r3, r3, r7, lsl #1

    bne             extra_cp_src_to_dst_height_loop

    b               end_of_cp_src_to_dst

    ENDP
399
;===========================================================
;In vp8cx_pick_filter_level(), call vp8_yv12_copy_frame_yonly
;without extend_frame_borders.
;
; Copies the Y plane described by the buffer config in r0 into the Y
; plane described by the buffer config in r1, two rows per iteration.
; The destination border is left untouched.
;
; In:    r0 = source buffer config, r1 = destination buffer config
;        (y_height, y_width, y_stride and y_buffer are read from r0;
;         y_stride and y_buffer are read from r1)
; Saved: r4-r11, lr and d8-d15 are pushed on entry and restored on exit
;        (q4-q7 are used as copy registers).
; Assumes y_width is a multiple of 16 and y_height is a non-zero even
; number; neither is checked here. Rows shorter than 128 bytes are
; copied by the 16-byte remainder loop only.
|vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8-d15}

    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]       ;src stride
    ldr             r7, [r1, #yv12_buffer_config_y_stride]       ;dst stride
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1

    ; The width loop below decrements before it tests, so it must never
    ; be entered with a zero count (the counter would wrap around and
    ; the copy would run far past both buffers). Rows narrower than 128
    ; bytes are copied entirely by the 16-byte remainder path.
    movs            r12, r5, lsr #7
    beq             cp_src_to_dst_remainder1

    ; copy two rows at one time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_loop1
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r10, r2, r6             ;src, odd row
    add             r11, r3, r7             ;dst, odd row
    mov             r12, r5, lsr #7         ;number of 128-byte chunks (> 0 here)

cp_src_to_dst_width_loop1
    vld1.8          {q0, q1}, [r8]!
    vld1.8          {q8, q9}, [r10]!
    vld1.8          {q2, q3}, [r8]!
    vld1.8          {q10, q11}, [r10]!
    vld1.8          {q4, q5}, [r8]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q6, q7}, [r8]!
    vld1.8          {q14, q15}, [r10]!

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r9]!
    vst1.8          {q8, q9}, [r11]!
    vst1.8          {q2, q3}, [r9]!
    vst1.8          {q10, q11}, [r11]!
    vst1.8          {q4, q5}, [r9]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q6, q7}, [r9]!
    vst1.8          {q14, q15}, [r11]!

    bne             cp_src_to_dst_width_loop1

    subs            lr, lr, #1
    add             r2, r2, r6, lsl #1      ;advance two source rows
    add             r3, r3, r7, lsl #1      ;advance two destination rows

    bne             cp_src_to_dst_height_loop1

cp_src_to_dst_remainder1
    ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
    sub             r11, r5, r10                    ;bytes per row already copied
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1
    bne             extra_cp_src_to_dst_width1
end_of_cp_src_to_dst1

    vpop            {d8 - d15}
    pop             {r4-r11, pc}

;=============================
;Copy the last (y_width & 0x7f) bytes of every row, 16 bytes at a time.
;On entry: r10 = bytes left per row (non-zero), r11 = bytes per row
;already copied, r2/r3 = start of the source/destination plane.
;r0 is reused as a row pointer here; the buffer config pointer is not
;needed again before the function returns.
extra_cp_src_to_dst_width1
    add             r2, r2, r11
    add             r3, r3, r11

    mov             lr, r4, lsr #1          ;two rows per iteration
extra_cp_src_to_dst_height_loop1
    mov             r8, r2                  ;src, even row
    mov             r9, r3                  ;dst, even row
    add             r0, r8, r6              ;src, odd row
    add             r11, r9, r7             ;dst, odd row

    mov             r12, r10

extra_cp_src_to_dst_width_loop1
    vld1.8          {q0}, [r8]!
    vld1.8          {q1}, [r0]!

    subs            r12, r12, #16

    vst1.8          {q0}, [r9]!
    vst1.8          {q1}, [r11]!
    bne             extra_cp_src_to_dst_width_loop1

    subs            lr, lr, #1

    add             r2, r2, r6, lsl #1
    add             r3, r3, r7, lsl #1

    bne             extra_cp_src_to_dst_height_loop1

    b               end_of_cp_src_to_dst1

    ENDP
499
500    END
501