///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_27_to_33.s
//*
//* @brief
//*  contains function definition for intra prediction  interpolation filters
//*
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_mode_27_to_33()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

///**
//*******************************************************************************
//*
//* @brief
//*  intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
//* neighboring samples location pointed by 'pu1_ref' to the  tu
//* block location pointed by 'pu1_dst'
//*
//* @par description:
//*
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (neighboring reference samples)
//*
//* @param[in] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  integer transform block size
//*
//* @param[in] mode
//*  integer intraprediction mode
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//.if intra_pred_chroma_27_to_33 == c
//void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref,
//                                        word32 src_strd,
//                                         uword8 *pu1_dst,
//                                         word32 dst_strd,
//                                         word32 nt,
//                                         word32 mode)
.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_chroma_mode_27_to_33_av8
.extern gai4_ihevc_ang_table
.extern gau1_ihevc_planar_factor

.type ihevc_intra_pred_chroma_mode_27_to_33_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_intra_pred_chroma_mode_27_to_33(uword8 *pu1_ref, word32 src_strd,
//                                            uword8 *pu1_dst, word32 dst_strd,
//                                            word32 nt, word32 mode)
// ABI: AAPCS64
// In:  x0 = pu1_ref (interleaved UV reference samples)
//      x1 = src_strd (not used by this routine)
//      x2 = pu1_dst, x3 = dst_strd (bytes)
//      x4 = nt (transform size), x5 = mode (27..33, positive-angle vertical)
// Callee-saved state preserved on the stack: d9-d10, d12-d15, x19-x20.
// NOTE(review): chroma is interleaved, so all ref/dst offsets below are
// doubled relative to the luma version of this kernel.
//-----------------------------------------------------------------------------
ihevc_intra_pred_chroma_mode_27_to_33_av8:

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

    stp         d9,d10,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    adrp        x6,  :got:gai4_ihevc_ang_table //loads word32 gai4_ihevc_ang_table[35]
    ldr         x6,  [x6, #:got_lo12:gai4_ihevc_ang_table]

    lsl         x7,x4,#2                    //four_nt (in bytes: 2 * 2*nt, since chroma is interleaved)

    add         x8,x6,x5,lsl #2             //*gai4_ihevc_ang_table[mode]
    ldr         w9, [x8]                    //intra_pred_ang = gai4_ihevc_ang_table[mode]
    sxtw        x9,w9
    adrp        x1, :got:gau1_ihevc_planar_factor //used for ((row + 1) * intra_pred_ang) row values
    ldr         x1, [x1, #:got_lo12:gau1_ihevc_planar_factor]
    add         x6,x1,#1                    //x6 -> {1,2,3,...}: the (row+1) multipliers

    tst         x4,#7                       //nt multiple of 8? if not, take the 4-wide path
    add         x8,x0,x7                    //pu1_ref + four_nt
    mov         x14,#0                      //row
    mov         x12,x4
    bne         core_loop_4
    lsl         x4,x4,#1                    //x4 = 2*nt (row width in bytes for interleaved chroma)
    b           core_loop_8

core_loop_8:
    add         x8,x8,#2                    //pu1_ref_main_idx += (four_nt + 1)
    dup         v0.8b,w9                    //intra_pred_ang
    lsr         x12, x4, #4                 //x12 = nt/8 (x4 holds 2*nt here): 8-column tiles per row

    movi        v1.8b, #32
    mul         x7, x4, x12                 //total byte count to produce = 2*nt * nt/8 * 8

    movi        v6.8h, #31

    mov         x1,x8                       //remember ref base for row-group reload
    mov         x5,x4                       //remember row width for reload
    mov         x11,#2                      //post-increment: UV pair stride

prologue:
    ld1         {v3.8b},[x6]                //loads the row value
    umull       v2.8h, v3.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    xtn         v4.8b,  v4.8h
    shrn        v5.8b, v2.8h,#5             //idx = pos >> 5

    dup         v31.8b, v4.b[0]
    add         x0,x2,x3

    smov        x14, v5.s[0]                //(i row)extract idx to the r register
    lsl         x14,x14,#1                  //double idx bytes: interleaved UV

    dup         v29.8b, v4.b[1]             //(ii)
    and         x9,x14,#0xff                //(i row) get the last byte

    add         x10,x8,x9                   //(i row)*pu1_ref[ref_main_idx]

    asr         x14,x14,#8                  //(ii)shift by 8
    ld1         {v23.8b},[x10],x11          //(i row)ref_main_idx
    and         x9,x14,#0xff                //(ii)get the last byte

    asr         x14,x14,#8                  //(iii)
    ld1         {v9.8b},[x10]               //(i row)ref_main_idx_1
    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]

    and         x9,x14,#0xff                //(iii)
    sub         v30.8b,  v1.8b ,  v31.8b    //32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(i row)vmull_u8(ref_main_idx, dup_const_32_fract)

    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(i row)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         x14,x14,#8                  //(iv)

    dup         v27.8b, v4.b[2]             //(iii)
    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(iv)

    dup         v25.8b, v4.b[3]             //(iv)
    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]

    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    rshrn       v10.8b, v10.8h,#5           //(i row)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)

    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1

    dup         v31.8b, v4.b[4]             //(v)
    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)

    smov        x14, v5.s[1]                //extract idx to the r register
    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    lsl         x14,x14,#1                  //double idx bytes: interleaved UV

    st1         {v10.8b},[x2],#8            //(i row)
    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)

    and         x9,x14,#0xff                //(v)
    dup         v29.8b, v4.b[5]             //(vi)
    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]

    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)

    asr         x14,x14,#8                  //(vi)
    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         x9,x14,#0xff                //(vi)

    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v14.8b},[x0],x3            //(ii)
    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)

    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]
    dup         v27.8b, v4.b[6]             //(vii)
    asr         x14,x14,#8                  //(vii)

    and         x9,x14,#0xff                //(vii)
    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)

    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v18.8b},[x0],x3            //(iii)
    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)

    asr         x14,x14,#8                  //(viii)
    dup         v25.8b, v4.b[7]             //(viii)
    and         x9,x14,#0xff                //(viii)

    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)

    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]
    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    subs        x7,x7,#8                    //8 output bytes produced this tile

    st1         {v22.8b},[x0],x3            //(iv)
    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)

    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)

    add         x20,x8,#8                   //advance ref base to next 8-column tile...
    csel        x8, x20, x8,gt              //...only if more tiles remain
    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    sub         x20,x4,#8                   //decrement remaining columns in this row group...
    csel        x4, x20, x4,gt              //...only if more tiles remain

    st1         {v10.8b},[x0],x3            //(v)
    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)

    beq         epilogue

    //preload next 8-row index/fraction set for the software-pipelined kernel
    ld1         {v5.8b},[x6]                //loads the row value
    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)
    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    xtn         v4.8b,  v4.8h
    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    smov        x14, v3.s[0]                //(i)extract idx to the r register
    lsl         x14,x14,#1                  //double idx bytes: interleaved UV
    and         x9,x14,#0xff                //(i)
    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]

kernel_8_rows:
    //steady-state: rows (i)-(viii) of the NEXT tile are computed while the
    //tail of the PREVIOUS tile ((vi)-(viii)) is still being stored.
    asr         x14,x14,#8                  //(ii)
    dup         v31.8b, v4.b[0]
    subs        x4,x4,#8

    ld1         {v23.8b},[x10],x11          //(i)ref_main_idx
    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
    and         x9,x14,#0xff                //(ii)
    add         x20,x6,#8                   //increment the row value
    csel        x6, x20, x6,le

    ld1         {v9.8b},[x10]               //(i)ref_main_idx_1
    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(ii)*pu1_ref[ref_main_idx]

    ld1         {v5.8b},[x6]                //loads the row value
    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)
    asr         x14,x14,#8                  //(iii)

    dup         v29.8b, v4.b[1]             //(ii)
    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)
    and         x9,x14,#0xff                //(iii)

    st1         {v14.8b},[x0],x3            //(vi)
    sub         v30.8b,  v1.8b ,  v31.8b    //(i)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(iii)*pu1_ref[ref_main_idx]

    ld1         {v12.8b},[x12],x11          //(ii)ref_main_idx
    umull       v10.8h, v23.8b, v30.8b      //(i)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         x14,x14,#8                  //(iv)

    ld1         {v13.8b},[x12]              //(ii)ref_main_idx_1
    umlal       v10.8h, v9.8b, v31.8b       //(i)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         x9,x14,#0xff                //(iv)

    smov        x14, v3.s[1]                //extract idx to the r register
    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)

    dup         v27.8b, v4.b[2]             //(iii)
    sub         v28.8b,  v1.8b ,  v29.8b    //(ii)32-fract(dup_const_32_fract)
    csel        x4, x5, x4,le               //reload nt

    ld1         {v16.8b},[x10],x11          //(iii)ref_main_idx
    umull       v14.8h, v12.8b, v28.8b      //(ii)vmull_u8(ref_main_idx, dup_const_32_fract)
    add         x12,x8,x9                   //(iv)*pu1_ref[ref_main_idx]

    st1         {v18.8b},[x0],x3            //(vii)
    umlal       v14.8h, v13.8b, v29.8b      //(ii)vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v17.8b},[x10]              //(iii)ref_main_idx_1
    rshrn       v10.8b, v10.8h,#5           //(i)shift_res = vrshrn_n_u16(add_res, 5)

    dup         v25.8b, v4.b[3]             //(iv)
    umull       v2.8h, v5.8b, v0.8b         //pos = ((row + 1) * intra_pred_ang)

    st1         {v22.8b},[x0]               //(viii)
    sub         v26.8b,  v1.8b ,  v27.8b    //(iii)32-fract(dup_const_32_fract)

    ld1         {v20.8b},[x12],x11          //(iv)ref_main_idx
    umull       v18.8h, v16.8b, v26.8b      //(iii)vmull_u8(ref_main_idx, dup_const_32_fract)
    lsl         x14,x14,#1                  //double idx bytes: interleaved UV

    ld1         {v21.8b},[x12]              //(iv)ref_main_idx_1
    umlal       v18.8h, v17.8b, v27.8b      //(iii)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x0,x2,x3

    dup         v31.8b, v4.b[4]             //(v)
    rshrn       v14.8b, v14.8h,#5           //(ii)shift_res = vrshrn_n_u16(add_res, 5)
    and         x9,x14,#0xff                //(v)

    st1         {v10.8b},[x2],#8            //(i)
    sub         v24.8b,  v1.8b ,  v25.8b    //(iv)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(v)*pu1_ref[ref_main_idx]

    dup         v29.8b, v4.b[5]             //(vi)
    umull       v22.8h, v20.8b, v24.8b      //(iv)vmull_u8(ref_main_idx, dup_const_32_fract)
    asr         x14,x14,#8                  //(vi)

    dup         v27.8b, v4.b[6]             //(vii)
    umlal       v22.8h, v21.8b, v25.8b      //(iv)vmull_u8(ref_main_idx_1, dup_const_fract)
    and         x9,x14,#0xff                //(vi)

    dup         v25.8b, v4.b[7]             //(viii)
    rshrn       v18.8b, v18.8h,#5           //(iii)shift_res = vrshrn_n_u16(add_res, 5)
    add         x12,x8,x9                   //(vi)*pu1_ref[ref_main_idx]

    ld1         {v23.8b},[x10],x11          //(v)ref_main_idx
    and         v4.16b,  v2.16b ,  v6.16b   //dup_const_fract(fract = pos & (31))
    asr         x14,x14,#8                  //(vii)

    ld1         {v9.8b},[x10]               //(v)ref_main_idx_1
    shrn        v3.8b, v2.8h,#5             //idx = pos >> 5
    and         x9,x14,#0xff                //(vii)

    st1         {v14.8b},[x0],x3            //(ii)
    rshrn       v22.8b, v22.8h,#5           //(iv)shift_res = vrshrn_n_u16(add_res, 5)
    asr         x14,x14,#8                  //(viii)

    ld1         {v12.8b},[x12],x11          //(vi)ref_main_idx
    sub         v30.8b,  v1.8b ,  v31.8b    //(v)32-fract(dup_const_32_fract)
    add         x10,x8,x9                   //(vii)*pu1_ref[ref_main_idx]

    ld1         {v13.8b},[x12]              //(vi)ref_main_idx_1
    umull       v10.8h, v23.8b, v30.8b      //(v)vmull_u8(ref_main_idx, dup_const_32_fract)
    and         x9,x14,#0xff                //(viii)

    smov        x14, v3.s[0]                //(i)extract idx to the r register
    umlal       v10.8h, v9.8b, v31.8b       //(v)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x12,x8,x9                   //(viii)*pu1_ref[ref_main_idx]

    ld1         {v16.8b},[x10],x11          //(vii)ref_main_idx
    sub         v28.8b,  v1.8b ,  v29.8b    //(vi)32-fract(dup_const_32_fract)

    st1         {v18.8b},[x0],x3            //(iii)
    umull       v14.8h, v12.8b, v28.8b      //(vi)vmull_u8(ref_main_idx, dup_const_32_fract)
    csel        x8, x1, x8,le               //reload the source to pu1_src+2nt

    ld1         {v17.8b},[x10]              //(vii)ref_main_idx_1
    umlal       v14.8h, v13.8b, v29.8b      //(vi)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x20,x8,#8                   //increment the source next set 8 columns in same row
    csel        x8, x20, x8,gt

    ld1         {v20.8b},[x12],x11          //(viii)ref_main_idx
    rshrn       v10.8b, v10.8h,#5           //(v)shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v21.8b},[x12]              //(viii)ref_main_idx_1
    sub         v26.8b,  v1.8b ,  v27.8b    //(vii)32-fract(dup_const_32_fract)
    lsl         x20, x3,#3                  //8 * dst_strd
    csel        x12,x20,x12,le

    st1         {v22.8b},[x0],x3            //(iv)
    umull       v18.8h, v16.8b, v26.8b      //(vii)vmull_u8(ref_main_idx, dup_const_32_fract)
    sub         x20,x12,x5                  //8*dst_strd - row width
    csel        x12, x20, x12,le

    st1         {v10.8b},[x0],x3            //(v)
    umlal       v18.8h, v17.8b, v27.8b      //(vii)vmull_u8(ref_main_idx_1, dup_const_fract)
    add         x20,x2,x12                  //increment the dst pointer to 8*dst_strd - nt
    csel        x2, x20, x2,le

    xtn         v4.8b,  v4.8h
    rshrn       v14.8b, v14.8h,#5           //(vi)shift_res = vrshrn_n_u16(add_res, 5)
    lsl         x14,x14,#1                  //double idx bytes: interleaved UV

    and         x9,x14,#0xff                //(i)
    subs        x7,x7,#8                    //8 output bytes produced this tile
    add         x10,x8,x9                   //(i)*pu1_ref[ref_main_idx]

    bne         kernel_8_rows

epilogue:
    //drain the pipeline: rows (vi)-(viii) of the final tile
    st1         {v14.8b},[x0],x3            //(vi)
    rshrn       v18.8b, v18.8h,#5           //(vii)shift_res = vrshrn_n_u16(add_res, 5)

    sub         v24.8b,  v1.8b ,  v25.8b    //(viii)32-fract(dup_const_32_fract)
    umull       v22.8h, v20.8b, v24.8b      //(viii)vmull_u8(ref_main_idx, dup_const_32_fract)
    umlal       v22.8h, v21.8b, v25.8b      //(viii)vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v18.8b},[x0],x3            //(vii)
    rshrn       v22.8b, v22.8h,#5           //(viii)shift_res = vrshrn_n_u16(add_res, 5)

    st1         {v22.8b},[x0],x3            //(viii)
    b           end_loops

core_loop_4:
    //scalar-indexed path for nt == 4 (8 interleaved UV bytes per row)
    add         x10,x8,#2                   //pu1_ref_main_idx += (four_nt + 1)
    add         x11,x8,#4                   //pu1_ref_main_idx_1 += (four_nt + 2)
    mov         x8,#0                       //row = 0

    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2
    dup         v0.8b,w5                    //dup_const_fract
    sub         x20,x5,#32
    neg         x4, x20
    dup         v1.8b,w4                    //dup_const_32_fract

//inner_loop_4
    ld1         {v2.8b},[x10]               //ref_main_idx
    add         x8,x8,#1
    mov         x14,x5                      //fract_prev = fract

    ld1         {v3.8b},[x11]               //ref_main_idx_1
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2 (one UV pair)
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2 (one UV pair)

    dup         v6.8b,w5                    //dup_const_fract
    umull       v4.8h, v2.8b, v1.8b         //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20
    dup         v7.8b,w4                    //dup_const_32_fract
    umlal       v4.8h, v3.8b, v0.8b         //vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v23.8b},[x10]              //ref_main_idx
    add         x8,x8,#1

    ld1         {v9.8b},[x11]               //ref_main_idx_1
    rshrn       v4.8b, v4.8h,#5             //shift_res = vrshrn_n_u16(add_res, 5)

    mov         x14,x5                      //fract_prev = fract
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2 (one UV pair)
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2 (one UV pair)

    dup         v12.8b,w5                   //dup_const_fract
    umull       v10.8h, v23.8b, v7.8b       //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20
    dup         v13.8b,w4                   //dup_const_32_fract
    umlal       v10.8h, v9.8b, v6.8b        //vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v14.8b},[x10]              //ref_main_idx
    add         x8,x8,#1

    st1         {v4.8b},[x2],x3
    rshrn       v10.8b, v10.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)

    ld1         {v15.8b},[x11]              //ref_main_idx_1
    mov         x14,x5                      //fract_prev = fract
    add         x5,x8,#1                    //row + 1
    mul         x5, x5, x9                  //pos = ((row + 1) * intra_pred_ang)
    and         x5,x5,#31                   //fract = pos & (31)
    cmp         x14,x5                      //if(fract_prev > fract)
    add         x20,x10,#2                  //pu1_ref_main_idx += 2 (one UV pair)
    csel        x10, x20, x10,gt
    add         x11,x10,#2                  //pu1_ref_main_idx_1 += 2 (one UV pair)

    dup         v18.8b,w5                   //dup_const_fract
    umull       v16.8h, v14.8b, v13.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)

    sub         x20,x5,#32
    neg         x4, x20
    dup         v19.8b,w4                   //dup_const_32_fract
    umlal       v16.8h, v15.8b, v12.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)

    ld1         {v20.8b},[x10]              //ref_main_idx

    st1         {v10.8b},[x2],x3
    rshrn       v16.8b, v16.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)
    ld1         {v21.8b},[x11]              //ref_main_idx_1

    umull       v22.8h, v20.8b, v19.8b      //vmull_u8(ref_main_idx, dup_const_32_fract)
    umlal       v22.8h, v21.8b, v18.8b      //vmull_u8(ref_main_idx_1, dup_const_fract)

    st1         {v16.8b},[x2],x3
    rshrn       v22.8b, v22.8h,#5           //shift_res = vrshrn_n_u16(add_res, 5)

    st1         {v22.8b},[x2],x3

end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d9,d10,[sp],#16
    ret
553
554
555
556
557