1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevc_deblk_luma_vert.s
//*
//* //brief
//*  contains the function definition for luma deblocking across a vertical
//*  edge. the function is hand-written aarch64 (armv8) assembly using neon
//*  instructions, in gnu assembler syntax
//*
//* //author
//*  anand s
//*
//* //par list of functions:
//*  - ihevc_deblk_luma_vert_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************/
39
// Everything below is emitted into the code section. On AArch64 gas the
// operand of `.align` is a power of two, so `.p2align 4` requests the same
// 16-byte alignment in its unambiguous spelling.
.text
.p2align 4

// Lookup tables defined outside this file; the function loads one 32-bit
// entry from each (indexed by the clipped beta / tc index).
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

// Exported entry point, tagged as a function symbol.
.global ihevc_deblk_luma_vert_av8
.type ihevc_deblk_luma_vert_av8, %function
51
//------------------------------------------------------------------------------
// ihevc_deblk_luma_vert_av8
//
// Luma deblocking filter across a vertical edge. Works on 4 rows; for each
// row the 8 pixels pu1_src[-4] .. pu1_src[3] around the edge are loaded and up
// to 3 pixels on each side are rewritten in place.
//
// Inputs (as consumed by the code below):
//   x0      pu1_src           - address of the first pixel to the right of the
//                               edge, row 0
//   w1      src_strd          - byte distance between consecutive rows
//   w2      bs                - only (bs >> 1) is used, in the tc index
//   w3      quant_param_p
//   w4      quant_param_q
//   w5      beta_offset_div2
//   w6      tc_offset_div2
//   w7      filter_flag_p     - 0 => leave the pixels left of the edge alone
//   [sp]    filter_flag_q     - 0 => leave the pixels right of the edge alone
//                               (read as a 32-bit word)
//
// NOTE(review): w1-w4 and w7 are assumed to be 32-bit integer parameters like
// w5/w6 and the stack argument - confirm against the C prototype. AAPCS64
// leaves the upper 32 bits of such arguments unspecified, so they are widened
// here before any 64-bit use.
//
// Output: none (pixels are modified in memory).
//
// Stack: 96 bytes (d8-d15, x19-x22 saved and restored).
//
// Register roles once the decision values are computed:
//   x5 = beta   x6 = tc   x3 = d0   x4 = d3   x14 = dp   x12 = dq
//   x21 = filter_flag_p   x22 = filter_flag_q
//   v24.s[0] = pu1_src[-4]   v6 = pu1_src[-3]   v3 = pu1_src[-2]
//   v2 = pu1_src[-1]         v4 = pu1_src[0]    v5 = pu1_src[1]
//   v7 = pu1_src[2]          v22 = pu1_src[3]
//   (each vector holds that column for rows 0..3 in its low 4 bytes)
//------------------------------------------------------------------------------
ihevc_deblk_luma_vert_av8:

    // Widen every 32-bit integer argument before it is used as a 64-bit
    // value (address offsets, arithmetic, compares).
    sxtw        x1,w1                       // src_strd
    sxtw        x2,w2                       // bs
    sxtw        x3,w3                       // quant_param_p
    sxtw        x4,w4                       // quant_param_q
    sxtw        x5,w5                       // beta_offset_div2
    sxtw        x6,w6                       // tc_offset_div2
    stp         d8,d9,[sp,#-16]!
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    mov         w21,w7                      // filter_flag_p, upper 32 bits cleared
    ldr         w22,[sp,#96]                // filter_flag_q: first stack argument,
                                            // 96 = size of the save area above

//    qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
//    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
//    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1                    // x3 = qp_luma
    add         x7,x3,x5,lsl #1
    add         x3,x3,x6,lsl #1
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt              // upper clip at 51
    bgt         l1.56
    cmp         x7,#0x0
    csel        x7, xzr, x7,lt              // lower clip at 0; x7 = beta_indx
l1.56:

    asr         x2,x2,#1                    // bs >> 1

    add         x3,x3,x2,lsl #1
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt              // upper clip at 53
    bgt         l1.88
    cmp         x3,#0x0
    csel        x3, xzr, x3,lt              // lower clip at 0; x3 = tc_indx

l1.88:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]

    movi        v18.8b, #0x2                // constant 2 (bytes)
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]

    ldr         w5,[x2,x7,lsl #2]           // x5 = beta
    movi        v16.8h, #0x2                // constant 2 (halfwords)
    ldr         w6,[x4,x3,lsl #2]           // x6 = tc
    lsl         x8,x6,#1
    cmp         x6,#0
    dup         v19.8b,w8                   // v19 = 2 * tc in every byte
    sub         x7,x0,#4                    // x7 -> pu1_src[-4], row 0
    movi        v23.8b, #0x3                // constant 3 (bytes)
    beq         l1.964                      // tc == 0: nothing to filter


    // Load 4 rows of 8 pixels and transpose them so that each column ends up
    // in its own 32-bit lane. The scalar byte loads interleaved with the NEON
    // code compute the row-0 / row-3 decision values.
    sub         x19,x0,#3
    ld1         {v15.8b},[x7],x1            // row 0
    ldrb        w8,[x19]                    // row 0, pu1_src[-3]
    ld1         {v1.8b},[x7],x1             // row 1
    ldrb        w10,[x19,#1]                // row 0, pu1_src[-2]
    ld1         {v29.8b},[x7],x1            // row 2
    ldrb        w11,[x19,#2]                // row 0, pu1_src[-1]
    ld1         {v0.8b},[x7]                // row 3
    ldrb        w12,[x0,#0]                 // row 0, pu1_src[0]
    ldrb        w9,[x0,#1]                  // row 0, pu1_src[1]
    trn1        v24.8b,v15.8b,v1.8b
    trn2        v1.8b,v15.8b,v1.8b
    ldrb        w2,[x0,#2]                  // row 0, pu1_src[2]
    trn1        v2.8b,v29.8b,v0.8b
    trn2        v0.8b,v29.8b,v0.8b
    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1
    csneg       x9,x9,x9,pl                 // x9 = dq0
//  dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
    mov         v29.8b,v24.8b
    trn1        v24.4h,v29.4h,v2.4h         // v24 = columns -4 | 0
    trn2        v2.4h,v29.4h,v2.4h          // v2  = columns -2 | 2
    add         x8,x8,x11
    mov         v15.8b,v1.8b
    trn1        v1.4h,v15.4h,v0.4h          // v1  = columns -3 | 1
    trn2        v0.4h,v15.4h,v0.4h          // v0  = columns -1 | 3
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl                 // x8 = dp0
//  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//



    add         x14,x1,x1,lsl #1
    add         x14,x0,x14                  // x14 -> pu1_src[0], row 3

    // Broadcast each column to both halves of its own register. v3 and v7
    // must be taken from v2 before v2 itself is overwritten below.
    sub         x19,x14,#3
    dup         v4.2s, v24.s[1]             // v4 = pu1_src[0]
    ldrb        w2,[x19]                    // row 3, pu1_src[-3]
    dup         v7.2s, v2.s[1]              // v7 = pu1_src[2]
    ldrb        w10,[x19,#1]                // row 3, pu1_src[-2]
    dup         v3.2s, v2.s[0]              // v3 = pu1_src[-2]
    ldrb        w11,[x19,#2]                // row 3, pu1_src[-1]
    dup         v5.2s, v1.s[1]              // v5 = pu1_src[1]
    ldrb        w12,[x14,#0]                // row 3, pu1_src[0]
    dup         v6.2s, v1.s[0]              // v6 = pu1_src[-3]
    ldrb        w3,[x14,#1]                 // row 3, pu1_src[1]
    dup         v2.2s, v0.s[0]              // v2 = pu1_src[-1]
    ldrb        w4,[x14,#2]                 // row 3, pu1_src[2]


    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1
    csneg       x12,x12,x12,pl              // x12 = dq3
//    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // x11 = dp3
//    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )//



    add         x3,x8,x9                    // x3 = d0
    add         x4,x11,x12                  // x4 = d3


//    d0 = dp0 + dq0//
//    d3 = dp3 + dq3//

    add         x14,x8,x11                  // x14 = dp
    add         x12,x12,x9                  // x12 = dq
//    dp = dp0 + dp3//
//    dq = dq0 + dq3//

    add         x11, x3, x4                 // x11 = d

//    d = d0 + d3//


    cmp         x11,x5
    dup         v22.2s, v0.s[1]             // v22 = pu1_src[3]
    bge         l1.964                      // d >= beta: edge is not filtered

//    if(d < beta)


    // Live from here on: x0,x1, x3 (d0), x4 (d3), x5 (beta), x6 (tc),
    // x12 (dq), x14 (dp).
    // Scratch: x2,x7,x8,x9,x10,x19.
    //
    // Strong/weak decision. The strong-filter results are computed
    // speculatively, interleaved with the scalar tests; any failing test
    // branches to l1.336 (weak filter, de = 1).
    //
    // Row 0 tests:
    //   2 * d0 < (beta >> 2)
    //   abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3)
    //   abs(pu1_src[0] - pu1_src[-1]) < ((5 * tc + 1) >> 1)
    uqsub       v30.8b,v7.8b,v19.8b         // pu1_src[2] - 2*tc (saturating)
    asr         x10,x5,#2
    uqadd       v31.8b,v7.8b,v19.8b         // pu1_src[2] + 2*tc (saturating)
    cmp         x10,x3,lsl #1
    uaddl       v0.8h,v5.8b,v4.8b           // pu1_src[1] + pu1_src[0]
    ble         l1.336

    sub         x19,x0,#4
    ldrb        w2,[x19]                    // row 0, pu1_src[-4]
    uaddw       v0.8h,  v0.8h ,  v2.8b      // + pu1_src[-1]
    ldrb        w7,[x19,#3]                 // row 0, pu1_src[-1]
    umull       v20.8h, v7.8b, v23.8b       // 3 * pu1_src[2]
    ldrb        w3,[x0,#0]                  // row 0, pu1_src[0]
    umlal       v20.8h, v22.8b, v18.8b      // + 2 * pu1_src[3]
    ldrb        w8,[x0,#3]                  // row 0, pu1_src[3]

    add         v20.8h,  v20.8h ,  v0.8h
    subs        x8,x8,x3
    rshrn       v22.8b,v20.8h,#3            // rounded >> 3
    csneg       x8,x8,x8,pl                 // abs(pu1_src[3] - pu1_src[0])
    subs        x2,x2,x7
    umin        v21.8b,  v22.8b ,  v31.8b
    csneg       x2,x2,x2,pl                 // abs(pu1_src[-4] - pu1_src[-1])
    umax        v22.8b,  v21.8b ,  v30.8b   // v22 = new pu1_src[2], clipped to +-2*tc
    add         x8,x8,x2
    uaddl       v20.8h,v7.8b,v3.8b          // pu1_src[2] + pu1_src[-2]
    cmp         x8,x5,asr #3
    mla         v20.8h, v0.8h, v16.8h       // + 2 * (pu1_src[-1] + pu1_src[0] + pu1_src[1])
    bge         l1.336
    uaddw       v0.8h,  v0.8h ,  v7.8b      // pu1_src[-1] + pu1_src[0] + pu1_src[1] + pu1_src[2]
    subs        x7,x3,x7
    rshrn       v20.8b,v20.8h,#3            // unclipped new pu1_src[0]
    csneg       x7,x7,x7,pl                 // abs(pu1_src[0] - pu1_src[-1])
    rshrn       v0.8b,v0.8h,#2              // unclipped new pu1_src[1]
    mov         x10,#5
    uqadd       v30.8b,v5.8b,v19.8b         // pu1_src[1] + 2*tc
    mul         x10, x10, x6
    uqsub       v31.8b,v5.8b,v19.8b         // pu1_src[1] - 2*tc
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    bge         l1.336


    // Row 3 tests: same three conditions, using d3 and the row-3 pixels.
    asr         x10,x5,#2
    uqsub       v25.8b,v4.8b,v19.8b         // pu1_src[0] - 2*tc
    cmp         x10,x4,lsl #1
    uqadd       v21.8b,v4.8b,v19.8b         // pu1_src[0] + 2*tc
    ble         l1.336
    umin        v26.8b,  v20.8b ,  v21.8b
    add         x4,x1,x1,lsl #1
    add         x4,x4,x0                    // x4 -> pu1_src[0], row 3
    umax        v20.8b,  v26.8b ,  v25.8b   // v20 = new pu1_src[0], clipped
    sub         x19,x4,#4
    ldrb        w2,[x19]                    // row 3, pu1_src[-4]
    umin        v19.8b,  v0.8b ,  v30.8b    // v19 is reused as a temporary here
    ldrb        w7,[x19,#3]                 // row 3, pu1_src[-1]
    umax        v21.8b,  v19.8b ,  v31.8b   // v21 = new pu1_src[1], clipped
    ldrb        w3,[x4,#0]                  // row 3, pu1_src[0]
    lsl         x10,x6,#1
    ldrb        w8,[x4,#3]                  // row 3, pu1_src[3]
    uaddl       v0.8h,v2.8b,v3.8b           // pu1_src[-1] + pu1_src[-2]
    dup         v19.8b,w10                  // restore v19 = 2 * tc
    subs        x8,x8,x3
    uaddw       v0.8h,  v0.8h ,  v4.8b      // + pu1_src[0]
    csneg       x8,x8,x8,pl
    uqadd       v30.8b,v2.8b,v19.8b         // pu1_src[-1] + 2*tc
    subs        x2,x2,x7
    uqsub       v31.8b,v2.8b,v19.8b         // pu1_src[-1] - 2*tc
    csneg       x2,x2,x2,pl
    uaddl       v26.8h,v5.8b,v6.8b          // pu1_src[1] + pu1_src[-3]
    add         x8,x8,x2
    mla         v26.8h, v0.8h, v16.8h       // + 2 * (pu1_src[-2] + pu1_src[-1] + pu1_src[0])
    cmp         x8,x5,asr #3
    bge         l1.336
    rshrn       v26.8b,v26.8h,#3            // unclipped new pu1_src[-1]
    subs        x7,x3,x7
    uqadd       v27.8b,v3.8b,v19.8b         // pu1_src[-2] + 2*tc
    csneg       x7,x7,x7,pl
    uqsub       v28.8b,v3.8b,v19.8b         // pu1_src[-2] - 2*tc
    mov         x10,#5
    umin        v16.8b,  v26.8b ,  v30.8b
    mul         x10, x10, x6
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    umax        v26.8b,  v16.8b ,  v31.8b   // v26 = new pu1_src[-1], clipped
    bge         l1.336
    uqadd       v30.8b,v6.8b,v19.8b         // pu1_src[-3] + 2*tc

    mov         x2,#2                       // de = 2: strong filter
    mov         x4,x21                      // x4 = filter_flag_p
    uqsub       v31.8b,v6.8b,v19.8b         // pu1_src[-3] - 2*tc
    mov         x5,x22                      // x5 = filter_flag_q
    b           end_dep_deq_decision
// x2 has the value of de
// x6 has the value of tc
// x5 has the value of beta
// x14 has the value of dp
// x12 has the value of dq
// x0 has the value of source address
// x1 has the src stride

l1.336:
    mov         x2,#1                       // de = 1: weak filter
l1.424:
    mov         x11,x5                      // x11 = beta
    mov         x4,x21                      // x4 = filter_flag_p
    mov         x5,x22                      // x5 = filter_flag_q

    // tc == 1: dep = deq = 0, skip the beta comparisons below.
    cmp         x6,#1
    csel        x9, xzr, x9,eq
    csel        x10, xzr, x10,eq
    beq         end_dep_deq_decision

    // threshold = (beta + (beta >> 1)) >> 3
    // dep = (threshold > dp), deq = (threshold > dq); a side whose filter
    // flag is clear gets 0 without the comparison.
    and         x7,x4,x5

    cmp         x7,#1
    beq         both_flags_set
    cmp         x4,#0
    beq         set_flag_dep_zero


    add         x8,x11,x11,asr #1
    mov         x10,#0                      // deq = 0
    asr         x8,x8,#3
    cmp         x8,x14
    cset        x9,gt                       // dep
    b           end_dep_deq_decision
set_flag_dep_zero:

    add         x8,x11,x11,asr #1
    mov         x9,#0                       // dep = 0
    asr         x8,x8,#3
    cmp         x8,x12
    cset        x10,gt                      // deq
    b           end_dep_deq_decision

both_flags_set:
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3
    cmp         x8,x14
    cset        x9,gt                       // dep
    cmp         x8,x12
    cset        x10,gt                      // deq
end_dep_deq_decision:

// x0 = source address
// x1 = stride
// x2 = de
// x4 = flag p
// x5 = flag q
// x6 = tc
// x9 = dep   (only meaningful when de == 1)
// x10 = deq  (only meaningful when de == 1)


    cmp         x2,#2
    bne         l1.968                      // de != 2: weak filter

    //--------------------------------------------------------------------
    // Strong filter: write back the values computed speculatively above.
    //--------------------------------------------------------------------
    cmp         x5,#0
    beq         l1.780                      // flag q clear: skip the right side

    // Right side: new pu1_src[2] as single bytes, new pu1_src[0..1] as
    // one halfword per row.
    add         x3,x0,#2
    st1         {v22.b}[0],[x3],x1

    st1         {v22.b}[1],[x3],x1

    st1         {v22.b}[2],[x3],x1

    st1         {v22.b}[3],[x3]
    add         x3,x0,x1
    mov         v29.8b,v20.8b
    trn1        v20.8b,v29.8b,v21.8b        // rows 0 and 2: {src[0], src[1]} pairs
    trn2        v21.8b,v29.8b,v21.8b        // rows 1 and 3: {src[0], src[1]} pairs

    st1         {v20.h}[0],[x0]
    st1         {v21.h}[0],[x3],x1
    st1         {v20.h}[1],[x3],x1
    st1         {v21.h}[1],[x3]


l1.780:
    cmp         x4,#0
    beq         l1.964                      // flag p clear: skip the left side


    // Left side: finish new pu1_src[-3] and pu1_src[-2] (v0 still holds
    // pu1_src[-2] + pu1_src[-1] + pu1_src[0]), store new pu1_src[-1] as single
    // bytes and new pu1_src[-3..-2] as one halfword per row.
    dup         v7.2s, v24.s[0]             // v7 = pu1_src[-4]
    sub         x3,x0,#1
    uaddw       v16.8h,  v0.8h ,  v6.8b     // + pu1_src[-3]
    add         x7,x3,x1
    rshrn       v2.8b,v16.8h,#2             // unclipped new pu1_src[-2]
    st1         {v26.b}[0],[x3]
    sub         x0,x0,#3
    umin        v16.8b,  v2.8b ,  v27.8b
    st1         {v26.b}[1],[x7],x1
    umull       v2.8h, v6.8b, v23.8b        // 3 * pu1_src[-3]
    umlal       v2.8h, v7.8b, v18.8b        // + 2 * pu1_src[-4]
    st1         {v26.b}[2],[x7],x1
    umax        v5.8b,  v16.8b ,  v28.8b    // v5 = new pu1_src[-2], clipped
    st1         {v26.b}[3],[x7]
    add         v0.8h,  v2.8h ,  v0.8h
    rshrn       v0.8b,v0.8h,#3              // unclipped new pu1_src[-3]


    umin        v1.8b,  v0.8b ,  v30.8b
    umax        v0.8b,  v1.8b ,  v31.8b     // v0 = new pu1_src[-3], clipped

    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b
    trn2        v5.8b,v29.8b,v5.8b
    st1         {v0.h}[0],[x0],x1
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]
l1.964:
    // Common exit: restore callee-saved registers in reverse push order.
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

l1.968:
    //--------------------------------------------------------------------
    // Weak filter.
    //--------------------------------------------------------------------

    movi        v0.8h, #0x9
    neg         x11, x6
    cmp         x4,#0                       // flag p test; consumed by the
                                            // "beq l1.1272" below - no
                                            // flag-setting instruction may be
                                            // inserted in between
    movi        v16.8h, #0x3
    movi        v24.8b, #0x1                // constant 1 (bytes)


    dup         v30.8b,w11                  // v30 = -tc
    and         x11,x6,#0xff
    dup         v31.8b,w11                  // v31 = tc

    usubl       v18.8h,v4.8b,v2.8b          // pu1_src[0] - pu1_src[-1]
    mul         v18.8h, v18.8h, v0.8h
    usubl       v0.8h,v5.8b,v3.8b           // pu1_src[1] - pu1_src[-2]



    mul         v16.8h, v0.8h, v16.8h
    sub         v16.8h,  v18.8h ,  v16.8h
    srshr       v16.8h,v16.8h,#4
//   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

    abs         v0.8h, v16.8h
    xtn         v0.8b,  v0.8h
    // v0 = abs(delta), narrowed to bytes

    sqxtn       v16.8b,v16.8h
    // v16 = delta, saturated to signed bytes

    movi        v1.8b, #0xa
    dup         v21.8b,w11
    mul         v1.8b, v1.8b, v21.8b
    // v1 = 10 * tc

//if(abs(delta) < 10 * tc)

    smin        v18.8b,  v16.8b ,  v31.8b
    smax        v20.8b,  v18.8b ,  v30.8b

// delta = clip3(delta, -tc, tc)//          (kept in v20)
    sxtl        v16.8h, v20.8b
    uxtl        v18.8h, v2.8b
    add         v18.8h,  v18.8h ,  v16.8h

    sqxtun      v22.8b, v18.8h              // v22 = tmp_p0
    uxtl        v18.8h, v4.8b
    sub         v16.8h,  v18.8h ,  v16.8h
    sqxtun      v23.8b, v16.8h              // v23 = tmp_q0
// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
// tmp_q0 = clip_u8(pu1_src[0] - delta)//
    beq         l1.1272                     // flag p clear: skip the left side



    cmp         x9,#1
    bne         l1.1212
// dep set: also modify pu1_src[-2]
//   delta_p = clip3((((pu1_src[-3] + pu1_src[-1] + 1) >> 1) - pu1_src[-2] + delta) >> 1,
//                   -(tc >> 1), tc >> 1)

    asr         x3,x6,#1


    uaddl       v16.8h,v6.8b,v2.8b
    uaddw       v16.8h,  v16.8h ,  v24.8b
    dup         v18.8b,w3                   // v18 = tc >> 1
    neg         x3, x3
    dup         v19.8b,w3                   // v19 = -(tc >> 1)
    ushr        v16.8h,v16.8h,#1
    xtn         v16.8b,  v16.8h

    usubl       v16.8h,v16.8b,v3.8b
    saddw       v16.8h,  v16.8h ,  v20.8b
    sshr        v16.8h,v16.8h,#1
    sqxtn       v16.8b,v16.8h

    smin        v17.8b,  v16.8b ,  v18.8b
    smax        v16.8b,  v19.8b ,  v17.8b




    uxtl        v18.8h, v3.8b
    sxtl        v16.8h, v16.8b
    add         v16.8h,  v18.8h ,  v16.8h

    sqxtun      v16.8b, v16.8h              // clip_u8(pu1_src[-2] + delta_p)
    mov         v30.8b,v3.8b
    cmhs        v3.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


    bsl         v3.8b,v30.8b,v16.8b         // mask set ? original : filtered
l1.1212:
    // Store the left side: {pu1_src[-3], pu1_src[-2]} as one halfword per
    // row and pu1_src[-1] as single bytes.
    sub         x12,x0,#3
    sub         x3,x0,#1
    mov         v29.8b,v6.8b
    trn1        v6.8b,v29.8b,v3.8b
    trn2        v3.8b,v29.8b,v3.8b
    st1         {v6.h}[0],[x12],x1
    cmhs        v16.8b,v0.8b,v1.8b          // mask: abs(delta) >= 10 * tc
    st1         {v3.h}[0],[x12],x1
    bsl         v16.8b,v2.8b,v22.8b         // mask set ? pu1_src[-1] : tmp_p0
    st1         {v16.b}[0],[x3],x1
    st1         {v16.b}[1],[x3],x1
    st1         {v6.h}[1],[x12],x1
    st1         {v16.b}[2],[x3],x1
    st1         {v3.h}[1],[x12]
    st1         {v16.b}[3],[x3]
l1.1272:
    cmp         x5,#0
    beq         l1.964                      // flag q clear: done
    cmp         x10,#1
    bne         l1.1412
// deq set: also modify pu1_src[1]
//   delta_q = clip3((((pu1_src[2] + pu1_src[0] + 1) >> 1) - pu1_src[1] - delta) >> 1,
//                   -(tc >> 1), tc >> 1)
    mov         v2.8b,v7.8b
    asr         x3,x6,#1

    dup         v6.8b,w3                    // v6 = tc >> 1
    neg         x3, x3
    dup         v16.8b,w3                   // v16 = -(tc >> 1)
    uaddl       v2.8h,v2.8b,v4.8b
    uaddw       v2.8h,  v2.8h ,  v24.8b
    ushr        v2.8h,v2.8h,#1
    xtn         v2.8b,  v2.8h

    usubl       v2.8h,v2.8b,v5.8b
    ssubw       v2.8h,  v2.8h ,  v20.8b
    sshr        v2.8h,v2.8h,#1
    sqxtn       v3.8b,v2.8h

    smin        v2.8b,  v3.8b ,  v6.8b
    smax        v3.8b,  v16.8b ,  v2.8b



    uxtl        v16.8h, v5.8b
    sxtl        v2.8h, v3.8b
    add         v2.8h,  v16.8h ,  v2.8h
    sqxtun      v3.8b, v2.8h                // clip_u8(pu1_src[1] + delta_q)
    mov         v30.8b,v5.8b
    cmhs        v5.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


    bsl         v5.8b,v30.8b,v3.8b          // mask set ? original : filtered
l1.1412:
    // Store the right side: pu1_src[2] (unchanged by the weak filter) as
    // single bytes and {pu1_src[0], pu1_src[1]} as one halfword per row.
    add         x3,x0,#2
    add         x11,x3,x1
    st1         {v7.b}[0],[x3]
    st1         {v7.b}[1],[x11],x1
    st1         {v7.b}[2],[x11],x1
    cmhs        v0.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc
    st1         {v7.b}[3],[x11]
    bsl         v0.8b,v4.8b,v23.8b          // mask set ? pu1_src[0] : tmp_q0
    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b
    trn2        v5.8b,v29.8b,v5.8b
    st1         {v0.h}[0],[x0],x1
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]

    // Same epilogue as l1.964, duplicated to avoid an extra branch.
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret
634
635
636