///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_11_to_17.s
//*
//* @brief
//*  contains the function definition for luma intra prediction, angular
//*  modes 11 to 17. the function is hand-written armv8 (aarch64) neon
//*  assembly in gnu assembler syntax.
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_11_to_17_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
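///**
//*******************************************************************************
//*
//* @brief
//*    luma intra prediction for angular modes 11 to 17
//*
//* @par description:
//*    builds a temporary reference array (ref_temp) on the stack from pu1_ref,
//*    extends it with samples selected through the inverse angle when
//*    (nt * intra_pred_ang) >> 5 is less than -1, and then writes each output
//*    sample as a rounded linear interpolation between two neighbouring
//*    reference samples using a 5-bit fractional weight:
//*    (ref[idx] * (32 - fract) + ref[idx + 1] * fract + 16) >> 5
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this implementation)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  intra prediction mode; the lookup tables are indexed with mode and
//*  (mode - 11)
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
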
//void ihevc_intra_pred_luma_mode_11_to_17_av8(uword8* pu1_ref,
//                               word32 src_strd,
//                               uword8* pu1_dst,
//                               word32 dst_strd,
//                               word32 nt,
//                               word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//all six arguments arrive in registers; nothing is read from the caller's stack

// Section setup and symbol declarations (GNU as, AArch64 ELF).
.text
.align 4                                    // power-of-two form on this target: 16-byte alignment
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode_11_to_17_av8

// Lookup tables defined outside this file; the function loads their addresses
// through the GOT.
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_11_17                   // the symbol the code actually references

.type ihevc_intra_pred_luma_mode_11_to_17_av8, %function

//-----------------------------------------------------------------------------
// ihevc_intra_pred_luma_mode_11_to_17_av8
//
// ABI:    AAPCS64, GNU as syntax (ELF)
// In:     x0 = pu1_ref
//         x1 = src_strd   (never read; x1 is reused as a source pointer)
//         x2 = pu1_dst
//         w3 = dst_strd   (sign-extended to x3 on entry)
//         w4 = nt         (sign-extended to x4 on entry)
//         w5 = mode       (sign-extended to x5 on entry)
// Out:    prediction block written at pu1_dst, rows dst_strd bytes apart
// Saved:  d12-d15, x19, x20 are pushed on entry and restored before ret
// Clobb:  x0-x12, x14, v0-v7, v16-v31, flags
// Stack:  48 bytes of saved registers + 144 bytes of scratch (ref_temp)
//
// The size dispatch below only distinguishes nt == 4, 8, 16 and anything
// else (handled as 32).
//
// Outline:
//   1. ref_temp[nt - 1 + k] = pu1_ref[2nt - k] for k = 0 .. nt
//   2. if ((nt * intra_pred_ang) >> 5) < -1, fill ref_temp downwards from
//      ref_temp[nt - 2] with pu1_ref[2nt + (inv_ang_sum >> 8)]
//   3. nt == 4 : one 4x4 block (sz_4_proc)
//      else    : software-pipelined 8x8 blocks (prologue / kernel / epilogue)
//      every sample = (ref[idx]*(32 - fract) + ref[idx + 1]*fract + 16) >> 5
//-----------------------------------------------------------------------------
ihevc_intra_pred_luma_mode_11_to_17_av8:

    // AAPCS64 leaves bits [63:32] of a 32-bit integer argument unspecified.
    // These three are used as 64-bit values in address arithmetic below, so
    // widen them explicitly first.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt
    sxtw        x5, w5                      //mode

    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    adrp        x7,  :got:gai4_ihevc_ang_table
    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]

    adrp        x8,  :got:gai4_ihevc_inv_ang_table
    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]

    add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode]
    add         x8, x8, x5, lsl #2
    sub         x8, x8, #44                 //&gai4_ihevc_inv_ang_table[mode - 11] (11 entries * 4 bytes)

    ldr         w7,  [x7]                   //intra_pred_ang
    sxtw        x7,w7
    // ref_temp[2 * max_cu_size + 1]. The original reserved 132 bytes; 144 is
    // used so that sp stays 16-byte aligned for the whole function body.
    sub         sp, sp, #144

    ldr         w8,  [x8]                   //inv_ang
    sxtw        x8,w8
    add         x6, sp, x4                  //ref_temp + nt

    mul         x9, x4, x7                  //nt*intra_pred_ang

    sub         x6, x6, #1                  //ref_temp + nt - 1

    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
    dup         v30.8b,w7                   //intra_pred_ang

    mov         x7, x4                      //x7 = nt, consumed by the size dispatch below

    // First four samples are copied byte by byte: src[2nt - k] -> ref_temp[nt - 1 + k]
    ldrb        w11, [x1], #-1

    asr         x9, x9, #5                  //ref_idx = (nt*intra_pred_ang) >> 5

    ldrb        w12, [x1], #-1
    ldrb        w10, [x1], #-1
    ldrb        w14, [x1], #-1

    strb        w11, [x6], #1
    strb        w12, [x6], #1
    strb        w10, [x6], #1
    strb        w14, [x6], #1

    subs        x7, x7, #4
    beq         end_loop_copy               //nt == 4: only the final byte is left

    // nt > 4: rewind both pointers and redo the copy eight bytes at a time.
    sub         x6, x6,#4                   //back to ref_temp + nt - 1
    sub         x1, x1,#3                   //&src[2nt - 7], start of the top 8 source bytes

    subs        x7,x7,#4
    beq         loop_copy_8                 //nt == 8
    subs        x7,x7,#8
    beq         loop_copy_16                //nt == 16

loop_copy_32:
    ld1         {v0.8b},[x1]
    sub         x1, x1,#8
    ld1         {v1.8b},[x1]
    sub         x1, x1,#8
    ld1         {v2.8b},[x1]
    sub         x1, x1,#8
    ld1         {v3.8b},[x1]

    rev64       v0.8b,  v0.8b               //reverse byte order so ref_temp runs opposite to src
    rev64       v1.8b,  v1.8b
    st1         {v0.8b},[x6],#8
    rev64       v2.8b,  v2.8b
    st1         {v1.8b},[x6],#8
    rev64       v3.8b,  v3.8b
    st1         {v2.8b},[x6],#8
    st1         {v3.8b},[x6],#8
    sub         x1, x1,#1                   //x1 = &src[nt], the last sample to copy
    b           end_loop_copy

loop_copy_16:
    ld1         {v0.8b},[x1]
    sub         x1, x1,#8
    ld1         {v1.8b},[x1]

    rev64       v0.8b,  v0.8b
    rev64       v1.8b,  v1.8b

    st1         {v0.8b},[x6],#8
    st1         {v1.8b},[x6],#8
    sub         x1, x1,#1                   //x1 = &src[nt]
    b           end_loop_copy

loop_copy_8:
    ld1         {v0.8b},[x1]
    rev64       v0.8b,  v0.8b
    st1         {v0.8b},[x6],#8
    sub         x1, x1,#1                   //x1 = &src[nt]
end_loop_copy:

    ldrb        w11, [x1], #-1              //final sample (k = nt)
    strb        w11, [x6], #1

    cmp         x9, #-1
    bge         prologue_8_16_32            //ref_idx >= -1: no extra samples needed

    add         x6, sp, x4                  //ref_temp + nt
    sub         x6, x6, #2                  //ref_temp + nt - 2

    mov         x12, #-1

    sub         x20, x9, x12                //ref_idx + 1
    neg         x9, x20                     //loop count = -(ref_idx + 1)

    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]

    mov         x7, #128                    //inv_ang_sum

loop_copy_ref_idx:

    add         x7, x7, x8                  //inv_ang_sum += inv_ang

    lsr         x20, x7, #8
    ldrb        w11, [x1, x20]              //src[2nt + (inv_ang_sum >> 8)]
    strb        w11, [x6], #-1              //ref_temp is filled downwards

    subs        x9, x9, #1

    bne         loop_copy_ref_idx

prologue_8_16_32:
    cmp         x4, #4
    beq         sz_4_proc
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]

    lsr         x10, x4, #3
    ld1         {v31.8b},[x14],#8
    mul         x10, x4, x10                //block counter = nt * (nt / 8), decremented by 8 per 8x8 block

    mov         x11, x4                     //col counter to be inc/dec by #8
    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    mov         x0, #1

    sub         x7, x5, #11                 //mode - 11
    dup         v2.8b,w0                    //contains #1 for adding to get ref_main_idx + 1

    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]

    mov         x0, #2
    dup         v3.8b,w0                    //contains #2 for stepping an index by two rows

    add         x12, x12, x7, lsl #4        //table row for this mode (16 bytes per mode)
    mov         x8, x12

    mov         x7, #8
    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd: from below an 8-row strip to the top of the next 8 columns

    ldr         w9,  [x8]                   //least idx for the first column group
    sxtw        x9,w9
    add         x1, sp, x4                  //ref_temp + nt

    xtn         v6.8b,  v22.8h              //low byte of (col+1)*intra_pred_angle
    dup         v26.8b,w9                   //least idx added to final idx values
    sub         x1, x1, #1                  //ref_temp + nt - 1

    add         x6, x1, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (tbl source)
    sshr        v22.8h, v22.8h,#5

    mov         x0, #31
    dup         v29.8b,w0                   //contains #31 for the fract mask

    mov         x0, #32
    dup         v28.8b,w0                   //contains #32 for 32 - fract

    sqxtn       v19.8b,  v22.8h             //idx = ((col+1)*intra_pred_angle) >> 5

    and         v6.8b,  v6.8b ,  v29.8b     //fract = ((col+1)*intra_pred_angle) & 31

    mov         x0, #1
    dup         v27.8b,w0                   //row value inc or reset accordingly

    add         v19.8b,  v19.8b ,  v27.8b   //ref_main_idx (add row)
    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx (row 0)
    add         v21.8b,  v19.8b ,  v2.8b    //ref_main_idx + 1 (row 0)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 0)
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 0)
    add         v4.8b,  v19.8b ,  v2.8b     //ref_main_idx (row 1)
    add         v5.8b,  v21.8b ,  v2.8b     //ref_main_idx + 1 (row 1)

    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 2)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 2)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)

    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 2)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 2)
    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 4)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 4)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 4)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 4)
    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 6)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 6)

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 6)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 6)
    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)

    st1         {v24.8b},[x2], x3           //st (row 4)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)

    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v22.8b},[x2], x3           //st (row 5)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)

    st1         {v20.8b},[x2], x3           //st (row 6)

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    st1         {v18.8b},[x2], x3           //st (row 7)

    beq         end_func

    // Set up the next 8x8 block. The flags from this subs drive every
    // csel/bgt until the next flag-setting instruction:
    //   gt : more columns left in this 8-row strip
    //   le : strip finished, wrap to column 0 of the next 8 rows
    subs        x11, x11, #8
    add         x20, x8, #4
    csel        x8, x20, x8,gt              //gt: next least-idx table entry
    add         x20, x2, x7
    csel        x2, x20, x2,gt              //gt: dst back up 8 rows and right 8 columns
    csel        x8, x12, x8,le              //le: restart the least-idx table
    sub         x20, x2, x4
    csel        x2, x20, x2,le
    add         x20, x2, #8
    csel        x2, x20, x2,le              //le: dst = dst - nt + 8 (column 0 of the next strip)
    csel        x11, x4, x11,le             //le: reload the column counter
    bgt         lbl390
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
lbl390:
    add         x20, x0, #8
    csel        x0, x20, x0,le              //le: row value += 8

    mov         x5,x2                       //x5 = second store pointer (mode is no longer needed)
    ld1         {v31.8b},[x14],#8
    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    xtn         v23.8b,  v12.8h
    sshr        v12.8h, v12.8h,#5
    sqxtn       v25.8b,  v12.8h
    dup         v27.8b,w0                   //row value inc or reset accordingly
    ldr         w9,  [x8]
    sxtw        x9,w9
    add         x9, x0, x9
    sub         x9, x9, #1                  //row + least idx - 1
    dup         v26.8b,w9
    add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)

    sub         x4,x4,#8                    //from here on x4 = nt - 8

    // Software-pipelined loop: each pass finishes rows 4-7 of the previous
    // 8x8 block (stored through x5) while computing rows 0-3 of the current
    // one (stored through x2).
kernel_8_16_32:

    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx
    mov         v26.8b, v23.8b

    subs        x11, x11, #8
    add         x6, x1, x9
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
    add         v21.8b,  v2.8b ,  v19.8b    //ref_main_idx + 1

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    add         x20, x0, #8
    csel        x0, x20, x0,le
    add         x20, x8, #4
    csel        x8, x20, x8,gt
    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (tbl source)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    bgt         lbl429
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
lbl429:
    csel        x8, x12, x8,le
    dup         v27.8b,w0                   //row value inc or reset accordingly

    add         v4.8b,  v2.8b ,  v19.8b     //ref_main_idx (row 1)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 0)
    add         v5.8b,  v2.8b ,  v21.8b     //ref_main_idx + 1 (row 1)


    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 0)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    ld1         {v31.8b},[x14],#8
    and         v6.8b,  v29.8b ,  v26.8b    //fract for the current block

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    add         v19.8b,  v3.8b ,  v19.8b    //ref_main_idx (row 2)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
    add         v21.8b,  v3.8b ,  v21.8b    //ref_main_idx + 1 (row 2)

    add         x20, x4, #8
    csel        x11, x20, x11,le            //le: reload the column counter with nt
    ldr         w9,  [x8]
    sxtw        x9,w9
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 2)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)

    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 4)
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 4)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    add         x5,x2,x3,lsl#2              //x5 = row 4 of the current block
    add         x9, x0, x9


    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 4)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 4)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    xtn         v23.8b,  v14.8h
    sshr        v14.8h, v14.8h,#5

    add         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx (row 6)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
    add         v21.8b,  v21.8b ,  v3.8b    //ref_main_idx + 1 (row 6)

    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    sub         x9, x9, #1
    sqxtn       v25.8b,  v14.8h

    add         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
    tbl         v14.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 6)
    add         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    tbl         v15.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 6)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    add         v19.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
    dup         v26.8b,w9

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)


    add         x2, x2, x3, lsl #2          //skip rows 4-7, which are stored through x5
    add         x20, x7, x2
    csel        x2, x20, x2,gt              //gt: up 8 rows, right 8 columns
    sub         x20, x2, x4
    csel        x2, x20, x2,le              //le: back to column 0 (x4 holds nt - 8 here)

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    bne         kernel_8_16_32
epil_8_16_32:

    // Drain the pipeline: rows 4-7 of the last 8x8 block.
    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    st1         {v18.8b},[x5], x3           //st (row 7)


    b           end_func

    // nt == 4: a single 4x4 block. Eight lanes are computed per row but only
    // the low four bytes of each result are stored.
sz_4_proc:
    adrp        x14,  :got:col_for_intra_luma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]

    ld1         {v31.8b},[x14]
    mov         x12, #1

    dup         v2.8b,w12                   //contains #1 for adding to get ref_main_idx + 1
    mov         x0, #2

    dup         v3.8b,w0                    //contains #2 for stepping an index by two rows
    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]

    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    sub         x7, x5, #11                 //mode - 11

    add         x12, x12, x7, lsl #4        //table row for this mode (16 bytes per mode)
    mov         x8, x12

    ldr         w9,  [x8]
    sxtw        x9,w9

    dup         v26.8b,w9                   //least idx added to final idx values
    add         x6, sp, x4                  //ref_temp + nt

    sub         x6, x6, #1                  //ref_temp + nt - 1
    xtn         v6.8b,  v22.8h
    add         x6, x6, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx (tbl source)
    mov         x0, #31

    dup         v29.8b,w0                   //contains #31 for the fract mask
    mov         x1, #32

    dup         v28.8b,w1                   //contains #32 for 32 - fract

    sshr        v22.8h, v22.8h,#5
    sqxtn       v19.8b,  v22.8h             //idx = ((col+1)*intra_pred_angle) >> 5

    and         v6.8b,  v6.8b ,  v29.8b     //fract = ((col+1)*intra_pred_angle) & 31
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    add         v19.8b,  v19.8b ,  v2.8b    //ref_main_idx (add 1)
    sub         v19.8b,  v19.8b ,  v26.8b   //ref_main_idx
    add         v21.8b,  v19.8b ,  v2.8b    //ref_main_idx + 1

    add         v4.8b,  v19.8b ,  v2.8b     //row 1 ref_main_idx
    add         v5.8b,  v21.8b ,  v2.8b

    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx (row 0)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 0)


    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx    (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    add         v19.8b,  v19.8b ,  v3.8b    //idx (row 2)
    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
    add         v21.8b,  v21.8b ,  v3.8b    //idx+1 (row 2)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v12.8b, {v0.16b},v19.8b     //load from ref_main_idx    (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shift (row 0)

    add         v4.8b,  v4.8b ,  v3.8b      //idx (row 3)
    tbl         v13.8b, {v0.16b},v21.8b     //load from ref_main_idx + 1 (row 2)
    add         v5.8b,  v5.8b ,  v3.8b      //idx+1 (row 3)

    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)

    st1         {v24.s}[0],[x2], x3         //st row 0
    rshrn       v22.8b, v22.8h,#5           //round shift (row 1)

    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)

    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)

    st1         {v22.s}[0],[x2], x3         //st row 1
    rshrn       v20.8b, v20.8h,#5           //round shift (row 2)

    st1         {v20.s}[0],[x2], x3         //st row 2

    rshrn       v18.8b, v18.8h,#5           //round shift (row 3)

    st1         {v18.s}[0],[x2], x3         //st (row 3)

end_func:
    add         sp, sp, #144                //release ref_temp (must match the sub above)
    ldp         x19, x20,[sp],#16           //restore in reverse order of the pushes
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ret
