1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_intra_pred_chroma.s
24//*
25//* @brief
26//*  Contains function definitions for intra chroma prediction .
27//*
28//* @author
29//*  Ittiam
30//*
31//* @par List of Functions:
32//*
//*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
37//*
38//* @remarks
39//*  None
40//*
41//*******************************************************************************
42//*/
43
44///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
45//
46
47///**
48///**
49///**
50//
51
52
53.text
54.p2align 2
55.include "ih264_neon_macros.s"
56
57.extern ih264_gai1_intrapred_chroma_plane_coeffs1
58.extern ih264_gai1_intrapred_chroma_plane_coeffs2
59
60
61
62///**
63//*******************************************************************************
64//*
65//*ih264_intra_pred_chroma_8x8_mode_dc
66//*
67//* @brief
68//*     Perform Intra prediction for  chroma_8x8 mode:DC
69//*
70//* @par Description:
71//*    Perform Intra prediction for  chroma_8x8 mode:DC ,described in sec 8.3.4.1
72//*
73//* @param[in] pu1_src
74//*  UWORD8 pointer to the source containing alternate U and V samples
75//*
76//* @param[out] pu1_dst
77//*  UWORD8 pointer to the destination with alternate U and V samples
78//*
79//* @param[in] src_strd
80//*  integer source stride
81//*
82//* @param[in] dst_strd
83//*  integer destination stride
84//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
87//*
88//* @returns
89//*
90//* @remarks
91//*  None
92//*
93//*******************************************************************************/
94//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
95//                                        UWORD8 *pu1_dst,
96//                                        WORD32 src_strd,
97//                                        WORD32 dst_strd,
98//                                        WORD32 ui_neighboravailability)
99
100//**************Variables Vs Registers*****************************************
101//    x0 => *pu1_src
102//    x1 => *pu1_dst
103//    w2 =>  src_strd
104//    w3 =>  dst_strd
105//    w4 =>  ui_neighboravailability
106
107
108
109    .global ih264_intra_pred_chroma_8x8_mode_dc_av8
110
111ih264_intra_pred_chroma_8x8_mode_dc_av8:
112
113
114    push_v_regs
115    stp       x19, x20, [sp, #-16]!
116    sxtw      x3, w3
117
118    mov       w19, #5
119    ands      w6, w4, w19
120    beq       none_available
121    cmp       w6, #1
122    beq       left_only_available
123    cmp       w6, #4
124    beq       top_only_available
125
126all_available:
127    ld1       {v0.8b, v1.8b}, [x0]
128    add       x6, x0, #18
129    ld1       {v2.8b, v3.8b}, [x6]
130    uxtl      v0.8h, v0.8b
131    uxtl      v1.8h, v1.8b
132    addp      v0.4s, v0.4s , v0.4s
133    addp      v1.4s, v1.4s , v1.4s
134    addp      v0.4s, v0.4s , v0.4s
135    addp      v1.4s, v1.4s , v1.4s
136    uxtl      v2.8h, v2.8b
137    uxtl      v3.8h, v3.8b
138    addp      v2.4s, v2.4s , v2.4s
139    addp      v3.4s, v3.4s , v3.4s
140    addp      v2.4s, v2.4s , v2.4s
141    addp      v3.4s, v3.4s , v3.4s
142    rshrn     v5.8b, v0.8h, #2
143    dup       v21.8h, v5.h[0]
144    rshrn     v6.8b, v3.8h, #2
145    dup       v20.8h, v6.h[0]
146    add       v1.8h, v1.8h, v2.8h
147    rshrn     v1.8b, v1.8h, #3
148    dup       v23.8h, v1.h[0]
149    mov       v20.d[0], v23.d[0]
150    add       v0.8h, v0.8h, v3.8h
151    rshrn     v0.8b, v0.8h, #3
152    dup       v23.8h, v0.h[0]
153    mov       v21.d[1], v23.d[0]
154    b         store
155left_only_available:
156    ld1       {v0.8b, v1.8b}, [x0]
157    uxtl      v0.8h, v0.8b
158    uxtl      v1.8h, v1.8b
159    addp      v0.4s, v0.4s , v0.4s
160    addp      v1.4s, v1.4s , v1.4s
161    addp      v0.4s, v0.4s , v0.4s
162    addp      v1.4s, v1.4s , v1.4s
163    rshrn     v0.8b, v0.8h, #2
164    rshrn     v1.8b, v1.8h, #2
165    dup       v20.8h , v1.h[0]
166    dup       v21.8h, v0.h[0]
167    b         store
168
169top_only_available:
170    add       x6, x0, #18
171    ld1       {v0.8b, v1.8b}, [x6]
172    uxtl      v0.8h, v0.8b
173    uxtl      v1.8h, v1.8b
174    addp      v0.4s, v0.4s , v0.4s
175    addp      v1.4s, v1.4s , v1.4s
176    addp      v0.4s, v0.4s , v0.4s
177    addp      v1.4s, v1.4s , v1.4s
178    rshrn     v0.8b, v0.8h, #2
179    rshrn     v1.8b, v1.8h, #2
180    dup       v20.8h , v0.h[0]
181    dup       v21.8h, v1.h[0]
182    mov       v20.d[1], v21.d[1]
183    mov       v21.d[0], v20.d[0]
184    b         store
185none_available:
186    mov       w15, #128
187    dup       v20.16b, w15
188    dup       v21.16b, w15
189
190
191store:
192
193    st1       { v20.16b}, [x1], x3
194    st1       { v20.16b}, [x1], x3
195    st1       { v20.16b}, [x1], x3
196    st1       { v20.16b}, [x1], x3
197    st1       { v21.16b}, [x1], x3
198    st1       { v21.16b}, [x1], x3
199    st1       { v21.16b}, [x1], x3
200    st1       { v21.16b}, [x1], x3
201end_func:
202
203    ldp       x19, x20, [sp], #16
204    pop_v_regs
205    ret
206
207
208
209
210
211///******************************************************************************
212
213
214///**
215//*******************************************************************************
216//*
217//*ih264_intra_pred_chroma_8x8_mode_horz
218//*
219//* @brief
220//*  Perform Intra prediction for  chroma_8x8 mode:Horizontal
221//*
222//* @par Description:
223//*   Perform Intra prediction for  chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
224//*
225//* @param[in] pu1_src
226//* UWORD8 pointer to the source containing alternate U and V samples
227//*
228//* @param[out] pu1_dst
229//*  UWORD8 pointer to the destination with alternate U and V samples
230//*
231//* @param[in] src_strd
232//*  integer source stride
233//*
234//* @param[in] dst_strd
235//*  integer destination stride
236//*
237//* @param[in] ui_neighboravailability
238//* availability of neighbouring pixels(Not used in this function)
239//*
240//* @returns
241//*
242//* @remarks
243//*  None
244//*
245//*******************************************************************************
246//*/
247//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
248//                                         UWORD8 *pu1_dst,
249//                                         WORD32 src_strd,
250//                                         WORD32 dst_strd,
251//                                         WORD32 ui_neighboravailability)
252//**************Variables Vs Registers*****************************************
253//    x0 => *pu1_src
254//    x1 => *pu1_dst
255//    w2 =>  src_strd
256//    w3 =>  dst_strd
257//    w4 =>  ui_neighboravailability
258
259
260    .global ih264_intra_pred_chroma_8x8_mode_horz_av8
261
262ih264_intra_pred_chroma_8x8_mode_horz_av8:
263
264
265
266    push_v_regs
267    sxtw      x3, w3
268    ld1       {v0.8h}, [x0]
269
270    dup       v10.8h, v0.h[7]
271    dup       v11.8h, v0.h[6]
272    dup       v12.8h, v0.h[5]
273    dup       v13.8h, v0.h[4]
274    st1       {v10.8h}, [x1], x3
275    dup       v14.8h, v0.h[3]
276    st1       {v11.8h}, [x1], x3
277    dup       v15.8h, v0.h[2]
278    st1       {v12.8h}, [x1], x3
279    dup       v16.8h, v0.h[1]
280    st1       {v13.8h}, [x1], x3
281    dup       v17.8h, v0.h[0]
282    st1       {v14.8h}, [x1], x3
283    st1       {v15.8h}, [x1], x3
284    st1       {v16.8h}, [x1], x3
285    st1       {v17.8h}, [x1], x3
286
287
288    pop_v_regs
289    ret
290
291
292
293
294
295
296///**
297//*******************************************************************************
298//*
299//*ih264_intra_pred_chroma_8x8_mode_vert
300//*
301//* @brief
302//*   Perform Intra prediction for  chroma_8x8 mode:vertical
303//*
304//* @par Description:
305//*Perform Intra prediction for  chroma_8x8 mode:vertical ,described in sec 8.3.4.3
306//*
307//* @param[in] pu1_src
308//* UWORD8 pointer to the source containing alternate U and V samples
309//*
310//* @param[out] pu1_dst
311//*   UWORD8 pointer to the destination with alternate U and V samples
312//*
313//* @param[in] src_strd
314//*  integer source stride
315//*
316//* @param[in] dst_strd
317//*  integer destination stride
318//*
319//* @param[in] ui_neighboravailability
320//* availability of neighbouring pixels(Not used in this function)
321//*
322//* @returns
323//*
324//* @remarks
325//*  None
326//*
327//*******************************************************************************
328//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
329//                                        UWORD8 *pu1_dst,
330//                                        WORD32 src_strd,
331//                                        WORD32 dst_strd,
332//                                        WORD32 ui_neighboravailability)
333
334//**************Variables Vs Registers*****************************************
335//    x0 => *pu1_src
336//    x1 => *pu1_dst
337//    w2 =>  src_strd
338//    w3 =>  dst_strd
339//    w4 =>  ui_neighboravailability
340
341
342    .global ih264_intra_pred_chroma_8x8_mode_vert_av8
343
344ih264_intra_pred_chroma_8x8_mode_vert_av8:
345
346    push_v_regs
347    sxtw      x3, w3
348
349    add       x0, x0, #18
350    ld1       {v0.8b, v1.8b}, [x0]
351
352    st1       {v0.8b, v1.8b}, [x1], x3
353    st1       {v0.8b, v1.8b}, [x1], x3
354    st1       {v0.8b, v1.8b}, [x1], x3
355    st1       {v0.8b, v1.8b}, [x1], x3
356    st1       {v0.8b, v1.8b}, [x1], x3
357    st1       {v0.8b, v1.8b}, [x1], x3
358    st1       {v0.8b, v1.8b}, [x1], x3
359    st1       {v0.8b, v1.8b}, [x1], x3
360
361    pop_v_regs
362    ret
363
364
365
366
367///******************************************************************************
368
369
370///**
371//*******************************************************************************
372//*
373//*ih264_intra_pred_chroma_8x8_mode_plane
374//*
375//* @brief
376//*   Perform Intra prediction for  chroma_8x8 mode:PLANE
377//*
378//* @par Description:
379//*  Perform Intra prediction for  chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
380//*
381//* @param[in] pu1_src
382//*  UWORD8 pointer to the source containing alternate U and V samples
383//*
384//* @param[out] pu1_dst
385//*  UWORD8 pointer to the destination with alternate U and V samples
386//*
387//* @param[in] src_strd
388//*  integer source stride
389//*
390//* @param[in] dst_strd
391//*  integer destination stride
392//*
393//* @param[in] ui_neighboravailability
394//*  availability of neighbouring pixels
395//*
396//* @returns
397//*
398//* @remarks
399//*  None
400//*
401//*******************************************************************************/
402//void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
403//                                        UWORD8 *pu1_dst,
404//                                        WORD32 src_strd,
405//                                        WORD32 dst_strd,
406//                                        WORD32 ui_neighboravailability)
407
408//**************Variables Vs Registers*****************************************
409//    x0 => *pu1_src
410//    x1 => *pu1_dst
411//    w2 =>  src_strd
412//    w3 =>  dst_strd
413//    w4 =>  ui_neighboravailability
414
415    .global ih264_intra_pred_chroma_8x8_mode_plane_av8
416ih264_intra_pred_chroma_8x8_mode_plane_av8:
417
418    push_v_regs
419    stp       x19, x20, [sp, #-16]!
420    sxtw      x3, w3
421
422    ld1       {v0.2s}, [x0]
423    add       x10, x0, #10
424    ld1       {v1.2s}, [x10]
425    add       x10, x10, #6
426    rev64     v5.4h, v0.4h
427    ld1       {v2.2s}, [x10], #8
428    add       x10, x10, #2
429    rev64     v7.4h, v2.4h
430    ld1       {v3.2s}, [x10]
431    sub       x5, x3, #8
432    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
433    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
434    usubl     v10.8h, v5.8b, v1.8b
435    ld1       {v8.8b, v9.8b}, [x12]     // Load multiplication factors 1 to 8 into D3
436    mov       v8.d[1], v9.d[0]
437    usubl     v12.8h, v3.8b, v7.8b
438    mul       v14.8h, v10.8h , v8.8h
439    mul       v16.8h, v12.8h , v8.8h
440    uzp1      v15.8h, v14.8h, v16.8h
441    uzp2      v16.8h, v14.8h, v16.8h
442    mov       v14.16b, v15.16b
443    mov       v15.d[0], v14.d[1]
444    mov       v17.d[0], v16.d[1]
445    addp      v14.4h, v14.4h, v14.4h
446    addp      v15.4h, v15.4h, v15.4h
447    addp      v16.4h, v16.4h, v16.4h
448    addp      v17.4h, v17.4h, v17.4h
449    addp      v14.4h, v14.4h, v14.4h
450    addp      v15.4h, v15.4h, v15.4h
451    addp      v16.4h, v16.4h, v16.4h
452    addp      v17.4h, v17.4h, v17.4h
453    mov       x6, #34
454    dup       v18.8h, w6
455    smull     v22.4s, v14.4h, v18.4h
456    smull     v24.4s, v15.4h, v18.4h
457    smull     v26.4s, v16.4h, v18.4h
458    smull     v28.4s, v17.4h, v18.4h
459    rshrn     v10.4h, v22.4s, #6
460    rshrn     v12.4h, v24.4s, #6
461    rshrn     v13.4h, v26.4s, #6
462    rshrn     v14.4h, v28.4s, #6
463    ldrb      w6, [x0], #1
464    add       x10, x0, #31
465    ldrb      w8, [x0], #1
466    ldrb      w7, [x10], #1
467    ldrb      w9, [x10], #1
468    add       w6, w6, w7
469    add       w8, w8, w9
470    lsl       w6, w6, #4
471    lsl       w8, w8, #4
472    dup       v0.8h, w6
473    dup       v2.8h, w8
474    dup       v4.8h, v12.h[0]
475    dup       v6.8h, v10.h[0]
476    dup       v24.8h, v14.h[0]
477    dup       v26.8h, v13.h[0]
478    zip1      v5.8h, v4.8h, v24.8h
479    zip2      v24.8h, v4.8h, v24.8h
480    mov       v4.16b, v5.16b
481    zip1      v7.8h, v6.8h, v26.8h
482    zip2      v26.8h, v6.8h, v26.8h
483    mov       v6.16b, v7.16b
484    zip1      v1.8h, v0.8h, v2.8h
485    zip2      v2.8h, v0.8h, v2.8h
486    mov       v0.16b, v1.16b
487
488    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
489    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]
490
491    ld1       {v8.2s, v9.2s}, [x12]
492    mov       v8.d[1], v9.d[0]
493    mov       v10.16b, v8.16b
494    mov       v22.16b, v8.16b
495    zip1      v9.8h, v8.8h, v10.8h
496    zip2      v10.8h, v8.8h, v10.8h
497    mov       v8.16b, v9.16b
498    mul       v12.8h, v4.8h , v8.8h
499    mul       v16.8h, v4.8h , v10.8h
500    add       v12.8h, v0.8h , v12.8h
501    add       v16.8h, v0.8h , v16.8h
502    dup       v20.8h, v22.h[0]
503    mul       v4.8h, v6.8h , v20.8h
504    dup       v30.8h, v22.h[1]
505    mul       v18.8h, v6.8h , v20.8h
506    mul       v14.8h, v6.8h , v30.8h
507    mul       v8.8h, v6.8h , v30.8h
508    add       v24.8h, v12.8h , v4.8h
509    add       v0.8h, v16.8h , v18.8h
510    add       v2.8h, v12.8h , v14.8h
511    sqrshrun  v28.8b, v24.8h, #5
512    add       v26.8h, v16.8h , v8.8h
513    sqrshrun  v29.8b, v0.8h, #5
514    dup       v20.8h, v22.h[2]
515    st1       {v28.8b, v29.8b}, [x1], x3
516    sqrshrun  v28.8b, v2.8h, #5
517    sqrshrun  v29.8b, v26.8h, #5
518    mul       v4.8h, v6.8h , v20.8h
519    mul       v18.8h, v6.8h , v20.8h
520    st1       {v28.8b, v29.8b}, [x1], x3
521    add       v24.8h, v12.8h , v4.8h
522    add       v0.8h, v16.8h , v18.8h
523    dup       v30.8h, v22.h[3]
524    sqrshrun  v28.8b, v24.8h, #5
525    sqrshrun  v29.8b, v0.8h, #5
526    mul       v14.8h, v6.8h , v30.8h
527    mul       v8.8h, v6.8h , v30.8h
528    st1       {v28.8b, v29.8b}, [x1], x3
529    add       v2.8h, v12.8h , v14.8h
530    add       v26.8h, v16.8h , v8.8h
531    dup       v20.8h, v22.h[4]
532    sqrshrun  v28.8b, v2.8h, #5
533    sqrshrun  v29.8b, v26.8h, #5
534    mul       v4.8h, v6.8h , v20.8h
535    mul       v18.8h, v6.8h , v20.8h
536    st1       {v28.8b, v29.8b}, [x1], x3
537    add       v24.8h, v12.8h , v4.8h
538    add       v0.8h, v16.8h , v18.8h
539    dup       v30.8h, v22.h[5]
540    sqrshrun  v28.8b, v24.8h, #5
541    sqrshrun  v29.8b, v0.8h, #5
542    mul       v14.8h, v6.8h , v30.8h
543    mul       v8.8h, v6.8h , v30.8h
544    st1       {v28.8b, v29.8b}, [x1], x3
545    add       v2.8h, v12.8h , v14.8h
546    add       v26.8h, v16.8h , v8.8h
547    dup       v20.8h, v22.h[6]
548    sqrshrun  v28.8b, v2.8h, #5
549    sqrshrun  v29.8b, v26.8h, #5
550    mul       v4.8h, v6.8h , v20.8h
551    mul       v18.8h, v6.8h , v20.8h
552    st1       {v28.8b, v29.8b}, [x1], x3
553    add       v24.8h, v12.8h , v4.8h
554    add       v0.8h, v16.8h , v18.8h
555    dup       v30.8h, v22.h[7]
556    sqrshrun  v28.8b, v24.8h, #5
557    sqrshrun  v29.8b, v0.8h, #5
558    mul       v14.8h, v6.8h , v30.8h
559    mul       v8.8h, v6.8h , v30.8h
560    st1       {v28.8b, v29.8b}, [x1], x3
561    add       v2.8h, v12.8h , v14.8h
562    add       v26.8h, v16.8h , v8.8h
563    sqrshrun  v28.8b, v2.8h, #5
564    sqrshrun  v29.8b, v26.8h, #5
565    st1       {v28.8b, v29.8b}, [x1], x3
566
567end_func_plane:
568
569    ldp       x19, x20, [sp], #16
570    pop_v_regs
571    ret
572
573
574
575