//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  impeg2_idct.s
// *
// * @brief
// *  contains function definitions for the single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - impeg2_idct_recon_dc_av8()
// *  - impeg2_idct_recon_dc_mismatch_av8()
// *  - impeg2_idct_recon_av8()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for an 8x8
// * input block
// *
// * @par description:
// *  performs inverse transform, adds the prediction data and clips the output
// * to 8 bit
// *
// * @param[in] pi2_src
// *  input 8x8 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 8x8 buffer for storing the inverse transform
// *  1st stage output
// *
// * @param[in] pu1_pred
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @returns  void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void impeg2_itrans_recon_8x8(word16 *pi2_src,
//                            word16 *pi2_tmp,
//                            uword8 *pu1_pred,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 pred_strd,
//                            word32 dst_strd,
//                            word32 zero_cols,
//                            word32 zero_rows)

//**************variables vs registers*************************
//    x0 => *pi2_src
//    x1 => *pi2_tmp
//    x2 => *pu1_pred
//    x3 => *pu1_dst
//    src_strd
//    pred_strd
//    dst_strd
//    zero_cols


.text
.align 4
.include "impeg2_neon_macros.s"

.set idct_stg1_shift, 12
.set idct_stg2_shift, 16
.set idct_stg1_round, (1 << (idct_stg1_shift - 1))
.set idct_stg2_round, (1 << (idct_stg2_shift - 1))

.extern gai2_impeg2_idct_q15
.extern gai2_impeg2_idct_q11
.extern gai2_impeg2_idct_first_col_q15
.extern gai2_impeg2_idct_first_col_q11
.extern gai2_impeg2_mismatch_stg2_additive

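//// The block below is a minimal C reference sketch (not part of the build) of the
//// DC-only path implemented by impeg2_idct_recon_dc_av8 further down: the single DC
//// coefficient is scaled by the first entries of gai2_impeg2_idct_q15 and
//// gai2_impeg2_idct_q11 with the stage-1/stage-2 rounding shifts, and the resulting
//// constant is added to every prediction sample and clipped to 8 bits. Names such as
//// idct_recon_dc_sketch and the IDCT_STG* macros are illustrative, not part of the library.
////
////    #include <stdint.h>
////
////    #define IDCT_STG1_SHIFT 12
////    #define IDCT_STG2_SHIFT 16
////    #define IDCT_STG1_ROUND (1 << (IDCT_STG1_SHIFT - 1))
////    #define IDCT_STG2_ROUND (1 << (IDCT_STG2_SHIFT - 1))
////
////    extern const int16_t gai2_impeg2_idct_q15[];
////    extern const int16_t gai2_impeg2_idct_q11[];
////
////    static void idct_recon_dc_sketch(const int16_t *pi2_src, const uint8_t *pu1_pred,
////                                     uint8_t *pu1_dst, int32_t pred_strd, int32_t dst_strd)
////    {
////        int32_t dc = pi2_src[0];
////        dc = (dc * gai2_impeg2_idct_q15[0] + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
////        dc = (dc * gai2_impeg2_idct_q11[0] + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT;
////        for (int32_t i = 0; i < 8; i++)
////            for (int32_t j = 0; j < 8; j++)
////            {
////                int32_t val = pu1_pred[i * pred_strd + j] + dc;
////                pu1_dst[i * dst_strd + j] = (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
////            }
////    }
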
.global impeg2_idct_recon_dc_av8
impeg2_idct_recon_dc_av8:
    // STMFD sp!,{x4,x6,x12,x14}
    push_v_regs
    ////x0: pi2_src
    ////x1: pi2_tmp - not used
    ////x2: pu1_pred
    ////x3: pu1_dst
    ////x4: used as scratch
    ////x5: pred_strd
    ////x6: dst_strd

    ldrsh           x4, [x0]
    adrp            x14, :got:gai2_impeg2_idct_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh           x12, [x14]

    ld1             {v0.8b}, [x2], x5
    mul             x4, x4, x12

    ld1             {v1.8b}, [x2], x5
    add             x4, x4, #idct_stg1_round

    ld1             {v2.8b}, [x2], x5
    asr             x4, x4, #idct_stg1_shift

    adrp            x14, :got:gai2_impeg2_idct_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh           x12, [x14]

    ld1             {v3.8b}, [x2], x5
    mul             x4, x4, x12

    ld1             {v4.8b}, [x2], x5
    add             x4, x4, #idct_stg2_round

    ld1             {v5.8b}, [x2], x5
    asr             x4, x4, #idct_stg2_shift

    ld1             {v6.8b}, [x2], x5
    dup             v30.8h, w4

    ld1             {v7.8b}, [x2], x5

    uaddw           v8.8h, v30.8h, v0.8b

    uaddw           v10.8h, v30.8h, v1.8b
    sqxtun          v0.8b, v8.8h

    uaddw           v12.8h, v30.8h, v2.8b
    sqxtun          v1.8b, v10.8h
    st1             {v0.8b}, [x3], x6

    uaddw           v14.8h, v30.8h, v3.8b
    sqxtun          v2.8b, v12.8h
    st1             {v1.8b}, [x3], x6

    uaddw           v16.8h, v30.8h, v4.8b
    sqxtun          v3.8b, v14.8h
    st1             {v2.8b}, [x3], x6

    uaddw           v18.8h, v30.8h, v5.8b
    sqxtun          v4.8b, v16.8h
    st1             {v3.8b}, [x3], x6

    uaddw           v20.8h, v30.8h, v6.8b
    sqxtun          v5.8b, v18.8h
    st1             {v4.8b}, [x3], x6

    uaddw           v22.8h, v30.8h, v7.8b
    sqxtun          v6.8b, v20.8h
    st1             {v5.8b}, [x3], x6

    sqxtun          v7.8b, v22.8h
    st1             {v6.8b}, [x3], x6

    st1             {v7.8b}, [x3], x6

    // LDMFD sp!,{x4,x6,x12,pc}
    pop_v_regs
    ret

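//// The block below is a minimal C sketch (illustrative only, reusing the macros and
//// extern declarations from the sketch above) of the DC + mismatch-control path that
//// impeg2_idct_recon_dc_mismatch_av8 implements: the stage-1 scaled DC is combined with
//// the per-coefficient entries of gai2_impeg2_mismatch_stg2_additive and rounded by the
//// stage-2 shift (which is what the raddhn/raddhn2 pair computes), then added to the
//// prediction and clipped. The 8x8 row-major layout of the additive table is assumed
//// from the 16-byte row stride used below.
////
////    extern const int16_t gai2_impeg2_mismatch_stg2_additive[];  /* assumed 8x8, row-major */
////
////    static void idct_recon_dc_mismatch_sketch(const int16_t *pi2_src, const uint8_t *pu1_pred,
////                                              uint8_t *pu1_dst, int32_t pred_strd, int32_t dst_strd)
////    {
////        int32_t dc = (pi2_src[0] * gai2_impeg2_idct_q15[0] + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
////        dc *= gai2_impeg2_idct_q11[0];
////        for (int32_t i = 0; i < 8; i++)
////            for (int32_t j = 0; j < 8; j++)
////            {
////                int32_t val = (dc + gai2_impeg2_mismatch_stg2_additive[i * 8 + j]
////                               + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT;
////                val += pu1_pred[i * pred_strd + j];
////                pu1_dst[i * dst_strd + j] = (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
////            }
////    }
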
.global impeg2_idct_recon_dc_mismatch_av8
.extern gai2_impeg2_idct_last_row_q11
.extern gai2_impeg2_mismatch_stg1_outp
impeg2_idct_recon_dc_mismatch_av8:
    // STMFD sp!,{x4-x12,x14}
    push_v_regs

    ldrsh           x4, [x0]
    adrp            x14, :got:gai2_impeg2_idct_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh           x12, [x14]

    mul             x4, x4, x12
    add             x4, x4, #idct_stg1_round
    asr             x4, x4, #idct_stg1_shift

    adrp            x14, :got:gai2_impeg2_idct_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh           x12, [x14]
    mul             x4, x4, x12
    dup             v0.4s, w4

    mov             x14, #16            //// increment for table read
    adrp            x4, :got:gai2_impeg2_mismatch_stg2_additive
    ldr             x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    ld1             {v2.4h, v3.4h}, [x4], x14
    ld1             {v30.8b}, [x2], x5
    sxtl            v8.4s, v2.4h
    sxtl            v10.4s, v3.4h
    raddhn          v12.4h, v0.4s, v8.4s
    raddhn2         v12.8h, v0.4s, v10.4s
    uaddw           v14.8h, v12.8h, v30.8b
    sqxtun          v30.8b, v14.8h
    st1             {v30.8b}, [x3], x6

    // LDMFD sp!,{x4-x12,pc}
    pop_v_regs
    ret

.globl impeg2_idct_recon_av8

.type impeg2_idct_recon_av8, %function

impeg2_idct_recon_av8:
//// register usage - loading and until idct of columns
////    cosine constants     -    d0
////    sine constants       -    d1
////    row 0 first half     -    d2     -    y0
////    row 1 first half     -    d6     -    y1
////    row 2 first half     -    d3     -    y2
////    row 3 first half     -    d7     -    y3
////    row 4 first half     -    d10    -    y4
////    row 5 first half     -    d14    -    y5
////    row 6 first half     -    d11    -    y6
////    row 7 first half     -    d15    -    y7

////    row 0 second half    -    d4     -    y0
////    row 1 second half    -    d8     -    y1
////    row 2 second half    -    d5     -    y2
////    row 3 second half    -    d9     -    y3
////    row 4 second half    -    d12    -    y4
////    row 5 second half    -    d16    -    y5
////    row 6 second half    -    d13    -    y6
////    row 7 second half    -    d17    -    y7

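//// Summary of the butterfly used below (collected from the inline comments that follow);
//// rnd/shift are idct_stg1_round/idct_stg1_shift for the column pass and the stg2 pair
//// for the row pass:
////
////    c0 = y0*cos4 + y4*cos4              c1 = y0*cos4 - y4*cos4
////    d0 = y2*cos2 + y6*sin2              d1 = y2*sin2 - y6*cos2
////    a0 = c0 + d0    a1 = c1 + d1    a2 = c1 - d1    a3 = c0 - d0
////    b0 = y1*cos1 + y3*cos3 + y5*sin3 + y7*sin1
////    b1 = y1*cos3 - y3*sin1 - y5*cos1 - y7*sin3
////    b2 = y1*sin3 - y3*cos1 + y5*sin1 + y7*cos3
////    b3 = y1*sin1 - y3*sin3 + y5*cos3 - y7*cos1
////    x0 = (a0 + b0 + rnd) >> shift       x7 = (a0 - b0 + rnd) >> shift
////    x1 = (a1 + b1 + rnd) >> shift       x6 = (a1 - b1 + rnd) >> shift
////    x2 = (a2 + b2 + rnd) >> shift       x5 = (a2 - b2 + rnd) >> shift
////    x3 = (a3 + b3 + rnd) >> shift       x4 = (a3 - b3 + rnd) >> shift
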
    //// copy the input pointer to another register
    //// step 1 : load all constants
    // stmfd sp!,{x4-x12,x14}

    ldr             w11, [sp]           // zero rows

    push_v_regs
    stp             x19, x20, [sp, #-16]!

    mov             x12, x7             // zero columns
    mov             x8, x5              // prediction stride
    mov             x7, x6              // destination stride
    mov             x6, x4              // src stride
    lsl             x6, x6, #1          // x sizeof(word16)
    add             x9, x0, x6, lsl #1  // 2 rows

    add             x10, x6, x6, lsl #1 // 3 rows

    sub             x10, x10, #8        // - 4 cols * sizeof(word16)
    sub             x5, x6, #8          // src_strd - 4 cols * sizeof(word16)

    adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1             {v0.4h, v1.4h}, [x14] //// d0,d1 are used for storing the constant data

    //// step 2 load all the input data
    //// step 3 operate on the first 4 columns at a time

    and             x11, x11, #0xff
    and             x12, x12, #0xff

    cmp             x11, #0xf0
    bge             skip_last4_rows

    ld1             {v2.4h}, [x0], #8
    ld1             {v3.4h}, [x9], #8
    ld1             {v4.4h}, [x0], x5
    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    ld1             {v5.4h}, [x9], x5
    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    ld1             {v6.4h}, [x0], #8
    ld1             {v7.4h}, [x9], #8
    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    ld1             {v8.4h}, [x0], x10
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    ld1             {v9.4h}, [x9], x10
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    ld1             {v10.4h}, [x0], #8
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
    ld1             {v11.4h}, [x9], #8
    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    ld1             {v12.4h}, [x0], x5
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    ld1             {v13.4h}, [x9], x5
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    ld1             {v14.4h}, [x0], #8
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    ld1             {v15.4h}, [x9], #8
    smull           v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
    ld1             {v16.4h}, [x0], x10
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
    ld1             {v17.4h}, [x9], x10

    //// the following loads were used when the input was not guaranteed to be aligned
////    vld1.16        d2,[x0]!
////    vld1.16        d3,[x2]!
////    vld1.16        d4,[x0]!
////    vld1.16        d5,[x2]!
////    vld1.16        d6,[x0]!
////    vld1.16        d7,[x2]!
////    vld1.16        d8,[x0],x3
////    vld1.16        d9,[x2],x3
////    vld1.16        d10,[x0]!
////    vld1.16        d11,[x2]!
////    vld1.16        d12,[x0]!
////    vld1.16        d13,[x2]!
////    vld1.16        d14,[x0]!
////    vld1.16        d15,[x2]!
////    vld1.16        d16,[x0],x3
////    vld1.16        d17,[x2],x3

    smlal           v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl           v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal           v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal           v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl           v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal           v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v10.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl           v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal           v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl           v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    add             v14.4s, v10.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)
    sub             v10.4s, v10.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v20.4s, v14.4s, v24.4s //// a0 + b0(part of x0)
    sub             v6.4s, v14.4s, v24.4s //// a0 - b0(part of x7)

    add             v24.4s, v22.4s, v28.4s //// a2 + b2(part of x2)
    sub             v22.4s, v22.4s, v28.4s //// a2 - b2(part of x5)

    add             v28.4s, v18.4s, v26.4s //// a1 + b1(part of x1)
    sub             v18.4s, v18.4s, v26.4s //// a1 - b1(part of x6)

    add             v26.4s, v10.4s, v30.4s //// a3 + b3(part of x3)
    sub             v30.4s, v10.4s, v30.4s //// a3 - b3(part of x4)

    sqrshrn         v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn         v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn         v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn         v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn         v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn         v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn         v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn         v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

    b               last4_cols

skip_last4_rows:
    adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1             {v0.4h, v1.4h}, [x14]

    ld1             {v2.4h}, [x0], #8
    ld1             {v3.4h}, [x9], #8
    ld1             {v4.4h}, [x0], x5
    ld1             {v5.4h}, [x9], x5
    ld1             {v6.4h}, [x0], #8
    ld1             {v7.4h}, [x9], #8
    ld1             {v8.4h}, [x0], x10
    ld1             {v9.4h}, [x9], x10

    movi            v12.4h, #0
    movi            v13.4h, #0
    movi            v16.4h, #0
    movi            v17.4h, #0

    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)

    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)

    add             v14.4s, v20.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)
    sub             v10.4s, v20.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v20.4s, v14.4s, v24.4s //// a0 + b0(part of x0)
    sub             v6.4s, v14.4s, v24.4s //// a0 - b0(part of x7)

    add             v24.4s, v22.4s, v28.4s //// a2 + b2(part of x2)
    sub             v22.4s, v22.4s, v28.4s //// a2 - b2(part of x5)

    add             v28.4s, v18.4s, v26.4s //// a1 + b1(part of x1)
    sub             v18.4s, v18.4s, v26.4s //// a1 - b1(part of x6)

    add             v26.4s, v10.4s, v30.4s //// a3 + b3(part of x3)
    sub             v30.4s, v10.4s, v30.4s //// a3 - b3(part of x4)

    sqrshrn         v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn         v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn         v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn         v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn         v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn         v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn         v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn         v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

last4_cols:
    adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1             {v0.4h, v1.4h}, [x14]

    cmp             x12, #0xf0
    bge             skip_last4cols

    smull           v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
    smull           v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)

    smull           v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)

    smlal           v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl           v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal           v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal           v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl           v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal           v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v12.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    smlsl           v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    smlal           v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    smlsl           v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    add             v16.4s, v12.4s, v8.4s //// a0 = c0 + d0(part of e0,e7)
    sub             v12.4s, v12.4s, v8.4s //// a3 = c0 - d0(part of e3,e4)
    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of e2,e5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of e1,e6)

    add             v20.4s, v16.4s, v24.4s //// a0 + b0(part of e0)
    sub             v8.4s, v16.4s, v24.4s //// a0 - b0(part of e7)

    add             v24.4s, v22.4s, v28.4s //// a2 + b2(part of e2)
    sub             v22.4s, v22.4s, v28.4s //// a2 - b2(part of e5)

    add             v28.4s, v18.4s, v26.4s //// a1 + b1(part of e1)
    sub             v18.4s, v18.4s, v26.4s //// a1 - b1(part of e6)

    add             v26.4s, v12.4s, v30.4s //// a3 + b3(part of e3)
    sub             v30.4s, v12.4s, v30.4s //// a3 - b3(part of e4)

    sqrshrn         v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn         v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn         v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn         v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn         v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn         v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn         v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn         v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift
    b               end_skip_last4cols

skip_last4cols:
    adrp            x14, :got:gai2_impeg2_idct_first_col_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1             {v0.4h, v1.4h}, [x14]

    umov            x15, v25.d[0]

    trn1            v25.4h, v2.4h, v6.4h
    trn2            v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1            v27.4h, v3.4h, v7.4h
    trn2            v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1            v6.2s, v29.2s, v31.2s
    trn2            v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1            v2.2s, v25.2s, v27.2s
    trn2            v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1            v25.4h, v10.4h, v14.4h
    trn2            v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1            v27.4h, v11.4h, v15.4h
    trn2            v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1            v10.2s, v25.2s, v27.2s
    trn2            v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1            v14.2s, v29.2s, v31.2s
    trn2            v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued.....

    mov             v25.d[0], x15

    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
//    vmull.s16    q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)

    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)

    sub             v22.4s, v20.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    add             v4.4s, v20.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)

    add             v2.4s, v4.4s, v24.4s

    sub             v6.4s, v4.4s, v24.4s

    add             v8.4s, v22.4s, v30.4s

    sub             v24.4s, v22.4s, v30.4s

    sqrshrn         v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn         v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn         v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn         v6.4h, v24.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v30.4s, v22.4s, v28.4s

    sub             v24.4s, v22.4s, v28.4s

    add             v28.4s, v18.4s, v26.4s

    sub             v22.4s, v18.4s, v26.4s
    sqrshrn         v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn         v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn         v8.4h, v22.4s, #idct_stg2_shift

    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

    trn1            v27.4h, v2.4h, v3.4h
    trn2            v29.4h, v2.4h, v3.4h
    trn1            v25.4h, v4.4h, v5.4h
    trn2            v31.4h, v4.4h, v5.4h

    trn1            v2.2s, v27.2s, v25.2s
    trn2            v4.2s, v27.2s, v25.2s
    trn1            v3.2s, v29.2s, v31.2s
    trn2            v5.2s, v29.2s, v31.2s

    trn1            v27.4h, v6.4h, v7.4h
    trn2            v29.4h, v6.4h, v7.4h
    trn1            v25.4h, v8.4h, v9.4h
    trn2            v31.4h, v8.4h, v9.4h

    trn1            v6.2s, v27.2s, v25.2s
    trn2            v8.2s, v27.2s, v25.2s
    trn1            v7.2s, v29.2s, v31.2s
    trn2            v9.2s, v29.2s, v31.2s

    mov             v25.d[0], x19
    mov             v25.d[1], x20

    smull           v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)

    smull           v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    smull           v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull           v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)

    add             x4, x2, x8, lsl #1  // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data

    add             x5, x8, x8, lsl #1  //

    add             x0, x3, x7, lsl #1  // x0 points to 3rd row of dest data

    add             x10, x7, x7, lsl #1 //

    // swapping v3 and v6
    mov             v31.d[0], v3.d[0]
    mov             v3.d[0], v6.d[0]
    mov             v6.d[0], v31.d[0]

    // swapping v5 and v8
    mov             v31.d[0], v5.d[0]
    mov             v5.d[0], v8.d[0]
    mov             v8.d[0], v31.d[0]

    sub             v22.4s, v20.4s, v14.4s //// a3 = c0 - d0(part of x3,x4)
    add             v12.4s, v20.4s, v14.4s //// a0 = c0 + d0(part of x0,x7)

    add             v0.4s, v12.4s, v24.4s

    sub             v24.4s, v12.4s, v24.4s

    add             v12.4s, v22.4s, v30.4s

    sub             v14.4s, v22.4s, v30.4s

    sqrshrn         v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn         v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn         v14.4h, v14.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v0.4s, v22.4s, v28.4s

    sub             v24.4s, v22.4s, v28.4s

    add             v28.4s, v18.4s, v26.4s

    sub             v26.4s, v18.4s, v26.4s
    ld1             {v18.8b}, [x2], x8

    sqrshrn         v12.4h, v0.4s, #idct_stg2_shift
    ld1             {v20.8b}, [x2], x5

    sqrshrn         v15.4h, v24.4s, #idct_stg2_shift
    ld1             {v19.8b}, [x2], x8

    sqrshrn         v11.4h, v28.4s, #idct_stg2_shift
    ld1             {v22.8b}, [x4], x8

    sqrshrn         v16.4h, v26.4s, #idct_stg2_shift
    ld1             {v21.8b}, [x2], x5

    b               pred_buff_addition
end_skip_last4cols:
    adrp            x14, :got:gai2_impeg2_idct_first_col_q11
    ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1             {v0.4h, v1.4h}, [x14]

    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

//// now the idct of columns is done, transpose so that the row idct can be done efficiently (step 5)
    trn1            v27.4h, v2.4h, v6.4h
    trn2            v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing
    trn1            v25.4h, v3.4h, v7.4h
    trn2            v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1            v2.2s, v27.2s, v25.2s
    trn2            v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1            v6.2s, v29.2s, v31.2s
    trn2            v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1            v27.4h, v4.4h, v8.4h
    trn2            v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second quadrant transposing
    trn1            v25.4h, v5.4h, v9.4h
    trn2            v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second quadrant transposing

    trn1            v4.2s, v27.2s, v25.2s
    trn2            v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second quadrant transposing continued.....
    trn1            v8.2s, v29.2s, v31.2s
    trn2            v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second quadrant transposing continued.....

    trn1            v27.4h, v10.4h, v14.4h
    trn2            v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing
    trn1            v25.4h, v11.4h, v15.4h
    trn2            v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1            v10.2s, v27.2s, v25.2s
    trn2            v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1            v14.2s, v29.2s, v31.2s
    trn2            v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued.....

    trn1            v27.4h, v12.4h, v16.4h
    trn2            v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth quadrant transposing
    trn1            v25.4h, v13.4h, v17.4h
    trn2            v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth quadrant transposing

    trn1            v12.2s, v27.2s, v25.2s
    trn2            v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    trn1            v16.2s, v29.2s, v31.2s
    trn2            v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth quadrant transposing continued.....

    mov             v25.d[0], x19
    mov             v25.d[1], x20

    //// step 6 operate on first four rows and find their idct
    //// register usage - storing and idct of rows
////    cosine constants         -    d0
////    sine constants           -    d1
////    element 0 first four     -    d2     -    y0
////    element 1 first four     -    d6     -    y1
////    element 2 first four     -    d3     -    y2
////    element 3 first four     -    d7     -    y3
////    element 4 first four     -    d4     -    y4
////    element 5 first four     -    d8     -    y5
////    element 6 first four     -    d5     -    y6
////    element 7 first four     -    d9     -    y7
////    element 0 second four    -    d10    -    y0
////    element 1 second four    -    d14    -    y1
////    element 2 second four    -    d11    -    y2
////    element 3 second four    -    d15    -    y3
////    element 4 second four    -    d12    -    y4
////    element 5 second four    -    d16    -    y5
////    element 6 second four    -    d13    -    y6
////    element 7 second four    -    d17    -    y7

    //// map between first kernel code seq and current
////        d2    ->    d2
////        d6    ->    d6
////        d3    ->    d3
////        d7    ->    d7
////        d10   ->    d4
////        d14   ->    d8
////        d11   ->    d5
////        d15   ->    d9
////        q3    ->    q3
////        q5    ->    q2
////        q7    ->    q4

    smull           v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)

    smlal           v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull           v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)

    smull           v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull           v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)

    smlal           v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl           v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal           v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal           v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl           v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal           v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v2.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl           v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal           v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl           v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub             v22.4s, v2.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    add             v4.4s, v2.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)

    add             v2.4s, v4.4s, v24.4s

    sub             v6.4s, v4.4s, v24.4s

    add             v8.4s, v22.4s, v30.4s

    sub             v24.4s, v22.4s, v30.4s

    sqrshrn         v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn         v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn         v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn         v6.4h, v24.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v30.4s, v22.4s, v28.4s

    sub             v24.4s, v22.4s, v28.4s

    add             v28.4s, v18.4s, v26.4s

    sub             v22.4s, v18.4s, v26.4s
    sqrshrn         v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn         v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn         v8.4h, v22.4s, #idct_stg2_shift

    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

    trn1            v27.4h, v2.4h, v3.4h
    trn2            v29.4h, v2.4h, v3.4h
    trn1            v25.4h, v4.4h, v5.4h
    trn2            v31.4h, v4.4h, v5.4h

    trn1            v2.2s, v27.2s, v25.2s
    trn2            v4.2s, v27.2s, v25.2s
    trn1            v3.2s, v29.2s, v31.2s
    trn2            v5.2s, v29.2s, v31.2s

    trn1            v27.4h, v6.4h, v7.4h
    trn2            v29.4h, v6.4h, v7.4h
    trn1            v25.4h, v8.4h, v9.4h
    trn2            v31.4h, v8.4h, v9.4h

    trn1            v6.2s, v27.2s, v25.2s
    trn2            v8.2s, v27.2s, v25.2s
    trn1            v7.2s, v29.2s, v31.2s
    trn2            v9.2s, v29.2s, v31.2s

    mov             v25.d[0], x19
    mov             v25.d[1], x20

    smull           v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
    smull           v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
    smull           v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
    smull           v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
    smlal           v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl           v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl           v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl           v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    smull           v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
    smull           v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
    smull           v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull           v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
    smlal           v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

    add             x4, x2, x8, lsl #1  // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    smlsl           v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

    add             x5, x8, x8, lsl #1  //
    smlal           v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

    add             x0, x3, x7, lsl #1  // x0 points to 3rd row of dest data
    smlal           v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    add             x10, x7, x7, lsl #1 //
    smlsl           v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)

    smlal           v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add             v12.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub             v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal           v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)

    // swapping v3 and v6
    mov             v31.d[0], v3.d[0]
    mov             v3.d[0], v6.d[0]
    mov             v6.d[0], v31.d[0]

    smlsl           v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    // swapping v5 and v8
    mov             v31.d[0], v5.d[0]
    mov             v5.d[0], v8.d[0]
    mov             v8.d[0], v31.d[0]

    smlal           v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl           v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub             v22.4s, v12.4s, v14.4s //// a3 = c0 - d0(part of x3,x4)
    add             v12.4s, v12.4s, v14.4s //// a0 = c0 + d0(part of x0,x7)

    add             v0.4s, v12.4s, v24.4s

    sub             v24.4s, v12.4s, v24.4s

    add             v12.4s, v22.4s, v30.4s

    sub             v14.4s, v22.4s, v30.4s

    sqrshrn         v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn         v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn         v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn         v14.4h, v14.4s, #idct_stg2_shift

    sub             v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add             v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add             v0.4s, v22.4s, v28.4s

    sub             v24.4s, v22.4s, v28.4s

    add             v28.4s, v18.4s, v26.4s

    sub             v26.4s, v18.4s, v26.4s
    ld1             {v18.8b}, [x2], x8

    sqrshrn         v12.4h, v0.4s, #idct_stg2_shift
    ld1             {v20.8b}, [x2], x5

    sqrshrn         v15.4h, v24.4s, #idct_stg2_shift
    ld1             {v19.8b}, [x2], x8

    sqrshrn         v11.4h, v28.4s, #idct_stg2_shift
    ld1             {v22.8b}, [x4], x8

    sqrshrn         v16.4h, v26.4s, #idct_stg2_shift
    ld1             {v21.8b}, [x2], x5

pred_buff_addition:

    umov            x19, v25.d[0]
    umov            x20, v25.d[1]

    trn1            v27.4h, v10.4h, v11.4h
    trn2            v29.4h, v10.4h, v11.4h
    trn1            v25.4h, v12.4h, v13.4h
    trn2            v31.4h, v12.4h, v13.4h

    trn1            v10.2s, v27.2s, v25.2s
    trn2            v12.2s, v27.2s, v25.2s
    trn1            v11.2s, v29.2s, v31.2s
    trn2            v13.2s, v29.2s, v31.2s

    trn1            v27.4h, v14.4h, v15.4h
    trn2            v29.4h, v14.4h, v15.4h
    trn1            v25.4h, v16.4h, v17.4h
    trn2            v31.4h, v16.4h, v17.4h

    trn1            v14.2s, v27.2s, v25.2s
    trn2            v16.2s, v27.2s, v25.2s
    trn1            v15.2s, v29.2s, v31.2s
    trn2            v17.2s, v29.2s, v31.2s

    mov             v25.d[0], x19
    mov             v25.d[1], x20

    // load prediction data
    ld1             {v24.8b}, [x4], x5
    ld1             {v23.8b}, [x4], x8
    ld1             {v25.8b}, [x4], x5

    // adding recon with prediction
    mov             v2.d[1], v3.d[0]
    mov             v4.d[1], v5.d[0]
    mov             v6.d[1], v7.d[0]
    mov             v8.d[1], v9.d[0]
    uaddw           v2.8h, v2.8h, v18.8b
    uaddw           v4.8h, v4.8h, v22.8b
    uaddw           v6.8h, v6.8h, v20.8b
    uaddw           v8.8h, v8.8h, v24.8b

    // swapping v11 and v14
    mov             v31.d[0], v11.d[0]
    mov             v11.d[0], v14.d[0]
    mov             v14.d[0], v31.d[0]

    // swapping v13 and v16
    mov             v31.d[0], v13.d[0]
    mov             v13.d[0], v16.d[0]
    mov             v16.d[0], v31.d[0]

// row values stored in the q registers:
//q1: x0
//q3: x1
//q2: x2
//q4: x3
//q5: x4
//q7: x5
//q6: x6
//q8: x7

    //// adding the prediction buffer

    mov             v10.d[1], v11.d[0]
    mov             v12.d[1], v13.d[0]
    mov             v14.d[1], v15.d[0]
    mov             v16.d[1], v17.d[0]
    uaddw           v10.8h, v10.8h, v19.8b
    sqxtun          v2.8b, v2.8h
    uaddw           v14.8h, v14.8h, v21.8b
    sqxtun          v4.8b, v4.8h
    uaddw           v12.8h, v12.8h, v23.8b
    sqxtun          v6.8b, v6.8h
    uaddw           v16.8h, v16.8h, v25.8b
    sqxtun          v8.8b, v8.8h

    st1             {v2.8b}, [x3], x7
    sqxtun          v10.8b, v10.8h
    st1             {v6.8b}, [x3], x10
    sqxtun          v14.8b, v14.8h
    st1             {v4.8b}, [x0], x7
    sqxtun          v12.8b, v12.8h
    st1             {v8.8b}, [x0], x10
    sqxtun          v16.8b, v16.8h

    st1             {v10.8b}, [x3], x7
    st1             {v14.8b}, [x3], x10
    st1             {v12.8b}, [x0], x7
    st1             {v16.8b}, [x0], x10

    // ldmfd sp!,{x4-x12,pc}
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret