1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_intra_pred_chroma.s
24//*
25//* @brief
26//*  Contains function definitions for intra chroma prediction .
27//*
28//* @author
29//*  Ittiam
30//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
37//*
38//* @remarks
39//*  None
40//*
41//*******************************************************************************
42//*/
43
//
// All the functions here are replicated from ih264_chroma_intra_pred_filters.c
//
51
52
// AArch64 NEON code, GNU assembler syntax. Everything below goes into .text,
// aligned to 4 bytes (one A64 instruction).
.text
.p2align 2

// push_v_regs / pop_v_regs are used by every routine in this file but are not
// defined here; presumably they come from this include -- confirm.
.include "ih264_neon_macros.s"

// Coefficient tables read by the plane predictor; defined outside this file.
.extern ih264_gai1_intrapred_chroma_plane_coeffs1
.extern ih264_gai1_intrapred_chroma_plane_coeffs2
59
60
61
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_dc_av8
//*
//* @brief
//*  Intra prediction for an 8x8 chroma block, mode DC (sec 8.3.4.1).
//*
//* @par Description:
//*  Writes 8 rows of 16 bytes (interleaved U,V) to pu1_dst. Rows 0-3 and
//*  rows 4-7 each get one 16-byte pattern made of two 8-byte halves; every
//*  half is a single (U,V) DC pair repeated four times.
//*
//*  Neighbour bytes as this routine reads them:
//*    pu1_src[ 0.. 7]  left pairs feeding rows 4-7
//*    pu1_src[ 8..15]  left pairs feeding rows 0-3
//*    pu1_src[18..25]  top pairs above columns 0-3
//*    pu1_src[26..33]  top pairs above columns 4-7
//*
//*  DC value used for each 4x4 quadrant:
//*    both available : top-left     = (left[8..15] + top[18..25] + 4) >> 3
//*                     top-right    = (top[26..33] + 2) >> 2
//*                     bottom-left  = (left[0..7]  + 2) >> 2
//*                     bottom-right = (left[0..7] + top[26..33] + 4) >> 3
//*    left only      : rows 0-3 = (left[8..15] + 2) >> 2,
//*                     rows 4-7 = (left[0..7]  + 2) >> 2
//*    top only       : columns 0-3 = (top[18..25] + 2) >> 2,
//*                     columns 4-7 = (top[26..33] + 2) >> 2
//*    none           : every byte = 128
//*  (sums are taken separately over the U bytes and the V bytes)
//*
//* @param[in] pu1_src  (x0)
//*  UWORD8 pointer to the neighbour samples, alternate U and V
//*
//* @param[out] pu1_dst (x1)
//*  UWORD8 pointer to the destination, alternate U and V
//*
//* @param[in] src_strd (x2)
//*  integer source stride (never read by this routine)
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride in bytes
//*
//* @param[in] ui_neighboravailability (w4)
//*  only bit 0 (left) and bit 2 (top) are examined
//*
//* @returns
//*  Nothing. x1 is left pointing 8 rows past the start of the block.
//*
//* @remarks
//*  Scratch registers written: x3, x6, x7, x15, v0-v3, v5, v6, v20, v21, v23
//*  and the condition flags.
//*
//*******************************************************************************
//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

    .global ih264_intra_pred_chroma_8x8_mode_dc_av8

ih264_intra_pred_chroma_8x8_mode_dc_av8:

    push_v_regs
    // dst_strd arrives as a 32-bit value; the upper half of x3 is not
    // guaranteed by the procedure call standard, so widen it before it is
    // used as a 64-bit post-index.
    sxtw      x3, w3

    // Keep only the left (bit 0) and top (bit 2) availability flags.
    // 5 is not encodable as a logical immediate, hence the scratch register.
    mov       w7, #5
    ands      w6, w4, w7
    beq       .Lchroma_dc_none_available
    cmp       w6, #1
    beq       .Lchroma_dc_left_only
    cmp       w6, #4
    beq       .Lchroma_dc_top_only

    // ---- left and top both available -------------------------------------
    ld1       {v0.8b, v1.8b}, [x0]          // v0 = left[0..7], v1 = left[8..15]
    add       x6, x0, #18
    ld1       {v2.8b, v3.8b}, [x6]          // v2 = top[18..25], v3 = top[26..33]

    // Widen to 16 bit, then pairwise-add 32-bit lanes twice. Each 32-bit lane
    // is one (U,V) pair, so afterwards halfword 0 = sum of the four U samples
    // and halfword 1 = sum of the four V samples (max 1020, no carry across).
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s

    rshrn     v5.8b, v0.8h, #2              // bottom-left : left[0..7] only
    dup       v21.8h, v5.h[0]
    rshrn     v6.8b, v3.8h, #2              // top-right   : top[26..33] only
    dup       v20.8h, v6.h[0]
    add       v1.8h, v1.8h, v2.8h           // top-left    : left[8..15] + top[18..25]
    rshrn     v1.8b, v1.8h, #3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]
    add       v0.8h, v0.8h, v3.8h           // bottom-right: left[0..7] + top[26..33]
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    mov       v21.d[1], v23.d[0]
    b         .Lchroma_dc_store

    // ---- only the left column available ----------------------------------
.Lchroma_dc_left_only:
    ld1       {v0.8b, v1.8b}, [x0]          // v0 = left[0..7], v1 = left[8..15]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v1.h[0]              // rows 0-3
    dup       v21.8h, v0.h[0]               // rows 4-7
    b         .Lchroma_dc_store

    // ---- only the top row available --------------------------------------
.Lchroma_dc_top_only:
    add       x6, x0, #18
    ld1       {v0.8b, v1.8b}, [x6]          // v0 = top[18..25], v1 = top[26..33]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v0.h[0]              // columns 0-3 value
    dup       v21.8h, v1.h[0]               // columns 4-7 value
    mov       v20.d[1], v21.d[1]            // v20 = {cols 0-3 | cols 4-7}
    mov       v21.d[0], v20.d[0]            // v21 = same pattern
    b         .Lchroma_dc_store

    // ---- nothing available: mid-grey -------------------------------------
.Lchroma_dc_none_available:
    mov       w15, #128
    dup       v20.16b, w15
    dup       v21.16b, w15

.Lchroma_dc_store:
    // v20 fills rows 0-3, v21 fills rows 4-7.
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3

    pop_v_regs
    ret
205
206
207
208
209
210///******************************************************************************
211
212
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_horz_av8
//*
//* @brief
//*  Intra prediction for an 8x8 chroma block, mode Horizontal (sec 8.3.4.2).
//*
//* @par Description:
//*  Reads the 16 bytes at pu1_src as eight (U,V) pairs and fills each output
//*  row with one pair repeated eight times. The pairs are consumed last to
//*  first: output row 0 uses bytes 14-15, row 1 bytes 12-13, ... row 7
//*  bytes 0-1.
//*
//* @param[in] pu1_src  (x0)
//*  UWORD8 pointer to the neighbour samples, alternate U and V
//*
//* @param[out] pu1_dst (x1)
//*  UWORD8 pointer to the destination, alternate U and V
//*
//* @param[in] src_strd (x2)
//*  integer source stride (never read by this routine)
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride in bytes
//*
//* @param[in] ui_neighboravailability (x4)
//*  availability of neighbouring pixels (never read by this routine)
//*
//* @returns
//*  Nothing. x1 is left pointing 8 rows past the start of the block.
//*
//* @remarks
//*  Registers written: x3, v0, v10-v17. v10-v15 are callee-saved in their low
//*  64 bits, which is why the body sits between push_v_regs and pop_v_regs
//*  (assuming that macro pair preserves them -- it is defined outside this
//*  file).
//*
//*******************************************************************************
//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
//                                         UWORD8 *pu1_dst,
//                                         WORD32 src_strd,
//                                         WORD32 dst_strd,
//                                         WORD32 ui_neighboravailability)

    .global ih264_intra_pred_chroma_8x8_mode_horz_av8

ih264_intra_pred_chroma_8x8_mode_horz_av8:

    push_v_regs
    // dst_strd arrives as a 32-bit value; widen it before it is used as a
    // 64-bit post-index (the upper half of x3 is not guaranteed on entry).
    sxtw      x3, w3

    ld1       {v0.8h}, [x0]                 // eight (U,V) pairs, one per halfword

    // Broadcast one pair per row; stores are interleaved with the remaining
    // broadcasts so a store never waits on the dup issued just before it.
    dup       v10.8h, v0.h[7]               // row 0
    dup       v11.8h, v0.h[6]               // row 1
    dup       v12.8h, v0.h[5]               // row 2
    dup       v13.8h, v0.h[4]               // row 3
    st1       {v10.8h}, [x1], x3
    dup       v14.8h, v0.h[3]               // row 4
    st1       {v11.8h}, [x1], x3
    dup       v15.8h, v0.h[2]               // row 5
    st1       {v12.8h}, [x1], x3
    dup       v16.8h, v0.h[1]               // row 6
    st1       {v13.8h}, [x1], x3
    dup       v17.8h, v0.h[0]               // row 7
    st1       {v14.8h}, [x1], x3
    st1       {v15.8h}, [x1], x3
    st1       {v16.8h}, [x1], x3
    st1       {v17.8h}, [x1], x3

    pop_v_regs
    ret
288
289
290
291
292
293
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_vert_av8
//*
//* @brief
//*  Intra prediction for an 8x8 chroma block, mode Vertical (sec 8.3.4.3).
//*
//* @par Description:
//*  Copies the 16 bytes at pu1_src + 18 (the row of interleaved U,V samples
//*  used as the "top" neighbours throughout this file) into each of the 8
//*  output rows.
//*
//* @param[in] pu1_src  (x0)
//*  UWORD8 pointer to the neighbour samples, alternate U and V
//*
//* @param[out] pu1_dst (x1)
//*  UWORD8 pointer to the destination, alternate U and V
//*
//* @param[in] src_strd (x2)
//*  integer source stride (never read by this routine)
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride in bytes
//*
//* @param[in] ui_neighboravailability (x4)
//*  availability of neighbouring pixels (never read by this routine)
//*
//* @returns
//*  Nothing. x0 is advanced by 18 and x1 is left pointing 8 rows past the
//*  start of the block.
//*
//* @remarks
//*  Registers written: x0, x1, x3, v0, v1.
//*
//*******************************************************************************
//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

    .global ih264_intra_pred_chroma_8x8_mode_vert_av8

ih264_intra_pred_chroma_8x8_mode_vert_av8:

    push_v_regs
    // dst_strd arrives as a 32-bit value; widen it before it is used as a
    // 64-bit post-index (the upper half of x3 is not guaranteed on entry).
    sxtw      x3, w3

    add       x0, x0, #18                   // skip to the top-neighbour row
    ld1       {v0.8b, v1.8b}, [x0]          // 16 bytes: 8 (U,V) pairs

    // Same 16 bytes for every row.
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3

    pop_v_regs
    ret
360
361
362
363
364///******************************************************************************
365
366
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_plane_av8
//*
//* @brief
//*  Intra prediction for an 8x8 chroma block, mode Plane (sec 8.3.4.4).
//*
//* @par Description:
//*  For U and V independently (samples are interleaved, so all vectors below
//*  hold U in even and V in odd halfword lanes):
//*
//*    Vgrad = sum_k w1[k] * (pu1_src[2*(4+k)]   - pu1_src[10 + 2*(2-k)... ])
//*            i.e. left pairs at bytes 0..7 (reversed) minus bytes 10..17
//*    Hgrad = same construction on bytes 26..33 minus bytes 16..23 (reversed)
//*    c = (34 * Vgrad + 32) >> 6
//*    b = (34 * Hgrad + 32) >> 6
//*    a = 16 * (pu1_src[0 or 1] + pu1_src[32 or 33])
//*    dst[y][x] = sat_u8((a + b * w2[x] + c * w2[y] + 16) >> 5)
//*
//*  w1 is ih264_gai1_intrapred_chroma_plane_coeffs1 read as 8 halfwords and
//*  w2 is ih264_gai1_intrapred_chroma_plane_coeffs2 read as 8 halfwords.
//*  NOTE(review): for the formula above to match the standard, w1 should be
//*  {1,1,2,2,3,3,4,4} and w2 should be {-3,...,4}; the tables are defined
//*  outside this file -- confirm.
//*
//* @param[in] pu1_src  (x0)
//*  UWORD8 pointer to the neighbour samples, alternate U and V
//*
//* @param[out] pu1_dst (x1)
//*  UWORD8 pointer to the destination, alternate U and V
//*
//* @param[in] src_strd (x2)
//*  integer source stride (never read by this routine)
//*
//* @param[in] dst_strd (w3)
//*  integer destination stride in bytes
//*
//* @param[in] ui_neighboravailability (x4)
//*  availability of neighbouring pixels (never read by this routine)
//*
//* @returns
//*  Nothing. x1 is left pointing 8 rows past the start of the block.
//*
//* @remarks
//*  Registers written: x3, x6-x10, x12, v0-v8, v10, v12-v18, v20, v22, v24,
//*  v26, v28, v29 (plus v9 as a load temporary). v8-v15 are callee-saved in
//*  their low 64 bits, which is why the body sits between push_v_regs and
//*  pop_v_regs (assuming that macro pair preserves them -- it is defined
//*  outside this file).
//*
//*******************************************************************************
//void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

    .global ih264_intra_pred_chroma_8x8_mode_plane_av8
ih264_intra_pred_chroma_8x8_mode_plane_av8:

    push_v_regs
    // dst_strd arrives as a 32-bit value; widen it before it is used as a
    // 64-bit post-index (the upper half of x3 is not guaranteed on entry).
    sxtw      x3, w3

    // ---- gather the four 4-pair neighbour groups --------------------------
    ld1       {v0.2s}, [x0]                 // bytes  0.. 7
    add       x10, x0, #10
    ld1       {v1.2s}, [x10]                // bytes 10..17
    add       x10, x0, #16
    ld1       {v2.2s}, [x10]                // bytes 16..23
    add       x10, x0, #26
    ld1       {v3.2s}, [x10]                // bytes 26..33
    rev64     v5.4h, v0.4h                  // reverse pair order of bytes 0..7
    rev64     v7.4h, v2.4h                  // reverse pair order of bytes 16..23

    // ---- weighted gradients -----------------------------------------------
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
    ld1       {v8.8b, v9.8b}, [x12]         // 16 bytes of weights ...
    mov       v8.d[1], v9.d[0]              // ... packed into v8 as 8 halfwords

    usubl     v10.8h, v5.8b, v1.8b          // vertical differences   (U,V interleaved)
    usubl     v12.8h, v3.8b, v7.8b          // horizontal differences (U,V interleaved)
    mul       v14.8h, v10.8h , v8.8h
    mul       v16.8h, v12.8h , v8.8h

    // De-interleave: v14 = U vertical, v15 = U horizontal,
    //                v16 = V vertical, v17 = V horizontal (low 4 halfwords each).
    uzp1      v15.8h, v14.8h, v16.8h
    uzp2      v16.8h, v14.8h, v16.8h
    mov       v14.16b, v15.16b
    mov       v15.d[0], v14.d[1]
    mov       v17.d[0], v16.d[1]

    // Two pairwise adds reduce each group of four products to a single sum.
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h

    // slope = (34 * gradient + 32) >> 6
    mov       w6, #34
    dup       v18.8h, w6
    smull     v22.4s, v14.4h, v18.4h
    smull     v24.4s, v15.4h, v18.4h
    smull     v26.4s, v16.4h, v18.4h
    smull     v28.4s, v17.4h, v18.4h
    rshrn     v10.4h, v22.4s, #6            // c for U (vertical slope)
    rshrn     v12.4h, v24.4s, #6            // b for U (horizontal slope)
    rshrn     v13.4h, v26.4s, #6            // c for V
    rshrn     v14.4h, v28.4s, #6            // b for V

    // ---- a = 16 * (src[0|1] + src[32|33]) ---------------------------------
    // ldrb zero-extends, so no further widening is needed.
    ldrb      w6, [x0]                      // U, byte 0
    ldrb      w8, [x0, #1]                  // V, byte 1
    ldrb      w7, [x0, #32]                 // U, byte 32
    ldrb      w9, [x0, #33]                 // V, byte 33
    add       w6, w6, w7
    add       w8, w8, w9
    lsl       w6, w6, #4
    lsl       w8, w8, #4

    // ---- build U,V-interleaved vectors of a, b and c ----------------------
    dup       v0.8h, w6
    dup       v2.8h, w8
    dup       v4.8h, v12.h[0]
    dup       v6.8h, v10.h[0]
    dup       v24.8h, v14.h[0]
    dup       v26.8h, v13.h[0]
    zip1      v4.8h, v4.8h, v24.8h          // v4 = {bU,bV,bU,bV,...}
    zip1      v6.8h, v6.8h, v26.8h          // v6 = {cU,cV,cU,cV,...}
    zip1      v0.8h, v0.8h, v2.8h           // v0 = {aU,aV,aU,aV,...}

    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]

    ld1       {v8.2s, v9.2s}, [x12]
    mov       v8.d[1], v9.d[0]              // v8 = 8 position weights (halfwords)
    mov       v22.16b, v8.16b               // keep an un-zipped copy for the rows
    zip2      v10.8h, v8.8h, v8.8h          // weights 4..7, each doubled for U,V
    zip1      v8.8h, v8.8h, v8.8h           // weights 0..3, each doubled for U,V

    // Row-independent part: a + b * w2[x]
    mul       v12.8h, v4.8h , v8.8h
    mul       v16.8h, v4.8h , v10.8h
    add       v12.8h, v0.8h , v12.8h        // columns 0-3
    add       v16.8h, v0.8h , v16.8h        // columns 4-7

    // ---- per row: add c * w2[y], round, saturate to u8, store -------------
    // The product c * w2[y] is the same for both column halves, so it is
    // computed once per row.
.irp row, 0, 1, 2, 3, 4, 5, 6, 7
    dup       v20.8h, v22.h[\row]
    mul       v4.8h, v6.8h , v20.8h
    add       v24.8h, v12.8h , v4.8h
    add       v26.8h, v16.8h , v4.8h
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    st1       {v28.8b, v29.8b}, [x1], x3
.endr

    pop_v_regs
    ret
572
573
574
575