;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left
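;
; A rough C-level view of this entry point (a sketch inferred from the
; register/stack comments above; the real prototype lives in the C-side
; glue, not in this file):
;
;   void vp8_build_intra_predictors_mby_neon_func(
;       unsigned char *y_buffer,   /* r0: reconstructed frame, at this MB */
;       unsigned char *ypred_ptr,  /* r1: 16x16 prediction output         */
;       int y_stride,              /* r2: stride of y_buffer              */
;       int mode,                  /* r3: 16x16 prediction mode           */
;       int Up, int Left);         /* stack: edge-availability flags      */
;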

|vp8_build_intra_predictors_mby_neon_func| PROC
    push            {r4-r8, lr}

    cmp             r3, #0
    beq             case_dc_pred
    cmp             r3, #1
    beq             case_v_pred
    cmp             r3, #2
    beq             case_h_pred
    cmp             r3, #3
    beq             case_tm_pred
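;
; The compares above form a switch on the prediction mode; judging by the
; case labels, the numbering follows the usual VP8 16x16 ordering (sketch):
;
;   switch (mode) {
;     case 0: /* DC_PRED */  case 1: /* V_PRED  */
;     case 2: /* H_PRED  */  case 3: /* TM_PRED */
;   }
;
; Any other mode value falls through into the DC case below.
;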

case_dc_pred
    ldr             r4, [sp, #24]       ; Up
    ldr             r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; Sum the availability flags (this feeds the shift below) and skip
    ; everything if neither edge is available
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3
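;
; The three pairwise-add steps above reduce the 16 above-row bytes to two
; partial sums (one in d8, one in d9), i.e. roughly:
;
;   /* sum = 0; for (i = 0; i < 16; i++) sum += above[i]; */
;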

    ; Move the partial sums back to integer registers
    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

skip_dc_pred_up

    cmp             r5, #0
    beq             skip_dc_pred_left

    sub             r0, r0, #1

    ; Load left column, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6
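;
; The four ldrb/add groups above are an unrolled version of summing the
; left column (a sketch, with 'left' pointing at y_buffer - 1):
;
;   for (i = 0; i < 16; i++)
;       sum += left[i * y_stride];
;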

skip_dc_pred_left
    add             r7, r7, #3          ; Shift
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4
    mov             r5, r12, lsr r7     ; expected_dc

    vdup.u8         q0, r5
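;
; The shift/round sequence above computes the rounded DC average, roughly
; (assuming Up and Left are 0/1 flags, so shift ends up 4 or 5):
;
;   shift = 3 + Up + Left;
;   expected_dc = (sum + (1 << (shift - 1))) >> shift;
;
; q0 now holds expected_dc in every byte lane.
;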

skip_dc_pred_up_left
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!

    pop             {r4-r8,pc}
case_v_pred
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    pop             {r4-r8,pc}
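;
; Vertical prediction, as unrolled above, is simply (sketch):
;
;   for (r = 0; r < 16; r++)
;       memcpy(ypred_ptr + r * 16, above, 16);
;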

case_h_pred
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    pop             {r4-r8,pc}
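;
; Horizontal prediction fills each row with that row's left-column pixel;
; the four blocks above unroll this four rows at a time (sketch):
;
;   for (r = 0; r < 16; r++)
;       memset(ypred_ptr + r * 16, left[r * y_stride], 16);
;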

case_tm_pred
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7
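;
; vmull.u8 against a vector of ones is used purely to widen the above row
; from u8 to u16; the vsub then leaves the signed per-column difference
; (above[c] - ytop_left) in q4/q5, computed once outside the row loop.
;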

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0
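;
; vqshrun.s16 with a zero shift acts as a saturating s16 -> u8 narrow,
; clamping each 16-bit sum back into the 0..255 pixel range.
;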

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    subs            r12, r12, #1
    bne             case_tm_pred_loop

    pop             {r4-r8,pc}
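;
; Putting the pieces together, each TM-predicted pixel is (a sketch, with
; clamp() saturating to 0..255):
;
;   pred[r][c] = clamp(left[r] + above[c] - ytop_left);
;
; The loop above emits four rows per iteration, four iterations in all.
;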

    ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_s_neon_func| PROC
    push            {r4-r8, lr}

    mov             r1, r0              ; unsigned char *ypred_ptr = x->dst.y_buffer;
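;
; This variant mirrors the cases of the function above; the only
; difference is that it predicts in place: r1 is pointed at y_buffer
; itself, and every store below advances by y_stride (r2) rather than
; packing the rows 16 bytes apart.
;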

    cmp             r3, #0
    beq             case_dc_pred_s
    cmp             r3, #1
    beq             case_v_pred_s
    cmp             r3, #2
    beq             case_h_pred_s
    cmp             r3, #3
    beq             case_tm_pred_s

case_dc_pred_s
    ldr             r4, [sp, #24]       ; Up
    ldr             r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; Sum the availability flags (this feeds the shift below) and skip
    ; everything if neither edge is available
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left_s

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up_s

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    ; Move the partial sums back to integer registers
    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

skip_dc_pred_up_s

    cmp             r5, #0
    beq             skip_dc_pred_left_s

    sub             r0, r0, #1

    ; Load left column, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left_s
    add             r7, r7, #3          ; Shift
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4
    mov             r5, r12, lsr r7     ; expected_dc

    vdup.u8         q0, r5

skip_dc_pred_up_left_s
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2

    pop             {r4-r8,pc}
case_v_pred_s
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    pop             {r4-r8,pc}

case_h_pred_s
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    pop             {r4-r8,pc}

case_tm_pred_s
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop_s
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    subs            r12, r12, #1
    bne             case_tm_pred_loop_s

    pop             {r4-r8,pc}

    ENDP


    END