1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * @file
21// *  ihevc_itrans_recon_32x32.s
22// *
23// * @brief
24// *  contains function definitions for single stage  inverse transform
25// *
26// * @author
27// * anand s
28// *
29// * @par list of functions:
30// *  - ihevc_itrans_recon_32x32()
31// *
32// * @remarks
33// *  the input buffer is being corrupted
34// *
35// *******************************************************************************
36//*/
37
38///**
39// *******************************************************************************
40// *
41// * @brief
42// *  this function performs inverse transform and reconstruction for a 32x32
43// * input block
44// *
45// * @par description:
46// *  performs inverse transform and adds the prediction  data and clips output
47// * to 8 bit
48// *
49// * @param[in] pi2_src
50// *  input 32x32 coefficients
51// *
52// * @param[in] pi2_tmp
53// *  temporary 32x32 buffer for storing the
54// *  1st stage output of the inverse
55// *  transform
56// *
57// *
58// * @param[in] pu1_pred
59// *  prediction 32x32 block
60// *
61// * @param[out] pu1_dst
62// *  output 32x32 block
63// *
64// * @param[in] src_strd
65// *  input stride
66// *
67// * @param[in] pred_strd
68// *  prediction stride
69// *
70// * @param[in] dst_strd
71// *  output stride
72// *
73// * @param[in] x12
74// *  zero columns in pi2_src (zero_cols)
75// *
76// * @param[in] x11
77// *  zero rows in pi2_src (zero_rows)
78// *
79// * @returns  void
80// *
81// * @remarks
82// *  none
83// *
84// *******************************************************************************
85// */
86
87//void ihevc_itrans_recon_32x32(word16 *pi2_src,
88//                            word16 *pi2_tmp,
89//                            uword8 *pu1_pred,
90//                            uword8 *pu1_dst,
91//                            word32 src_strd,
92//                            word32 pred_strd,
93//                            word32 dst_strd,
94//                            word32 x12,
95//                            word32 x11)
96
97//**************variables vs registers*************************
98//    x0 => *pi2_src
99//    x1 => *pi2_tmp
100//    x2 => *pu1_pred
101//    x3 => *pu1_dst
102//    x4 => src_strd
103//    x5 => pred_strd
104//    x6 => dst_strd
105//    x7 => zero_cols (copied to x12 on entry)
106//    [sp] => zero_rows (loaded into w11 on entry)
107
108
109//d0[0]=    64        d2[0]=83
110//d0[1]= 90        d2[1]=82
111//d0[2]= 90        d2[2]=80
112//d0[3]= 90        d2[3]=78
113//d1[0]= 89         d3[0]=75
114//d1[1]= 88        d3[1]=73
115//d1[2]= 87        d3[2]=70
116//d1[3]= 85        d3[3]=67
117
118//d4[0]=    64        d6[0]=36
119//d4[1]= 61        d6[1]=31
120//d4[2]= 57        d6[2]=25
121//d4[3]= 54        d6[3]=22
122//d5[0]= 50         d7[0]=18
123//d5[1]= 46        d7[1]=13
124//d5[2]= 43        d7[2]=9
125//d5[3]= 38        d7[3]=4
126
127.text
128.align 4
129.include "ihevc_neon_macros.s"    // NOTE(review): presumably supplies the push_v_regs macro used in the prologue - confirm
130
131
132
133
134.set shift_stage1_idct ,   7    // shift amount of the rounding narrowing (sqrshrn) applied to the stage-1 sums
135.set shift_stage2_idct ,   12    // NOTE(review): presumably the stage-2 shift; not referenced in the visible part of this file
136
137//#define zero_cols      x12    (8th argument; copied from x7 on entry)
138//#define zero_rows     x11    (9th argument; loaded from the caller's stack on entry)
139
140.globl ihevc_itrans_recon_32x32_av8
141
142.extern g_ai2_ihevc_trans_32_transpose    // constant table; its first 32 halfwords are loaded into v0-v7 as multipliers
143
144x5_addr: .word 0xfffff000    // threshold loaded into w5; zero_cols / zero_rows are compared against it (unsigned, bhs)
145x9_addr: .word 0xffff0000    // threshold loaded into w7 (not x9, despite the label); compared the same way as x5_addr
146
147.type ihevc_itrans_recon_32x32_av8, %function
148
149ihevc_itrans_recon_32x32_av8:
150
151    ldr         w11, [sp]
152
153// stmfd sp!,{x0-x12,x14}
154    push_v_regs
155    stp         x19, x20,[sp,#-16]!
156    stp         x0, x1,[sp,#-16]!
157    stp         x5, x6,[sp,#-16]!
158
159//ldr            x8,[sp,#56]     @ prediction stride
160//ldr            x7,[sp,#64]     @ destination stride
161    mov         x6, x4 // src stride
162    mov         x12, x7
163    lsl         x6, x6, #1                  // x sizeof(word16)
164    add         x10,x6,x6, lsl #1           // 3 rows
165
166
167    mov         x8,x0
168
169    adrp        x14, :got:g_ai2_ihevc_trans_32_transpose
170    ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_32_transpose]
171
172    ld1         {v0.4h, v1.4h, v2.4h, v3.4h},[x14],#32
173    ld1         {v4.4h, v5.4h, v6.4h, v7.4h},[x14],#32
174
175//registers which are free
176//  x10,x9,x11,x12
177    mov         x9,#0xffffff00
178    mov         x10,#0xfffffff0
179    ldr         w5, x5_addr
180    ldr         w7, x9_addr
181    cmp         x12,x10
182    mov         x20,#1
183    csel        x14, x20, x14,hs
184    bhs         stage1
185
186
187    cmp         x12,x9
188    mov         x20,#2
189    csel        x14, x20, x14,hs
190    bhs         stage1
191
192    cmp         x12,x5
193    mov         x20,#3
194    csel        x14, x20, x14,hs
195    bhs         stage1
196
197    cmp         x12,x7
198    mov         x20,#4
199    csel        x14, x20, x14,hs
200
201    mov         x14,#8
202    b           stage1
203//.ltorg
204
205
206dct_stage1:
207    add         x8,x8,#8
208    mov         x0,x8
209
210stage1:
211    ld1         {v10.4h},[x0],x6
212    ld1         {v8.4h},[x0],x6
213    ld1         {v11.4h},[x0],x6
214    ld1         {v9.4h},[x0],x6
215
216    smull       v24.4s, v8.4h, v0.h[1]     //// y1 * cos1(part of b0)
217    smull       v26.4s, v8.4h, v0.h[3]     //// y1 * cos3(part of b1)
218    smull       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
219    smull       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
220
221    smlal       v24.4s, v9.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
222    smlal       v26.4s, v9.4h, v2.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
223    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
224    smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
225
226
227
228
229
230    smull       v20.4s, v10.4h, v0.h[0]
231    smlal       v20.4s, v11.4h, v0.h[2]
232
233
234    smull       v22.4s, v10.4h, v0.h[0]
235    smlal       v22.4s, v11.4h, v1.h[2]
236
237    smull       v16.4s, v10.4h, v0.h[0]
238    smlal       v16.4s, v11.4h, v2.h[2]
239
240    smull       v18.4s, v10.4h, v0.h[0]
241    smlal       v18.4s, v11.4h, v3.h[2]
242    cmp         x11,x10
243    bhs         shift1
244
245    ld1         {v12.4h},[x0],x6
246    ld1         {v14.4h},[x0],x6
247    ld1         {v13.4h},[x0],x6
248    ld1         {v15.4h},[x0],x6
249
250
251
252
253
254
255
256    smlal       v24.4s, v14.4h, v1.h[1]
257    smlal       v26.4s, v14.4h, v3.h[3]
258    smlal       v28.4s, v14.4h, v6.h[1]
259    smlsl       v30.4s, v14.4h, v7.h[1]
260
261
262    smlal       v24.4s, v15.4h, v1.h[3]
263    smlal       v26.4s, v15.4h, v5.h[1]
264    smlsl       v28.4s, v15.4h, v7.h[1]
265    smlsl       v30.4s, v15.4h, v3.h[3]
266
267
268    smlal       v20.4s, v12.4h, v1.h[0]
269    smlal       v20.4s, v13.4h, v1.h[2]
270    smlal       v22.4s, v12.4h, v3.h[0]
271    smlal       v22.4s, v13.4h, v4.h[2]
272    smlal       v16.4s, v12.4h, v5.h[0]
273    smlal       v16.4s, v13.4h, v7.h[2]
274    smlal       v18.4s, v12.4h, v7.h[0]
275    smlsl       v18.4s, v13.4h, v5.h[2]
276
277    cmp         x11,x9
278    bhs         shift1
279
280    ld1         {v10.4h},[x0],x6
281    ld1         {v8.4h},[x0],x6
282    ld1         {v11.4h},[x0],x6
283    ld1         {v9.4h},[x0],x6
284
285
286    smlal       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
287    smlal       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
288    smlsl       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
289    smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
290
291    smlal       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
292    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
293    smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
294    smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
295
296
297
298
299
300    smlal       v20.4s, v10.4h, v2.h[0]
301    smlal       v20.4s, v11.4h, v2.h[2]
302
303
304    smlal       v22.4s, v10.4h, v6.h[0]
305    smlal       v22.4s, v11.4h, v7.h[2]
306
307    smlsl       v16.4s, v10.4h, v6.h[0]
308    smlsl       v16.4s, v11.4h, v3.h[2]
309
310    smlsl       v18.4s, v10.4h, v2.h[0]
311    smlsl       v18.4s, v11.4h, v1.h[2]
312
313    cmp         x11,x5
314    bhs         shift1
315
316
317    ld1         {v12.4h},[x0],x6
318    ld1         {v14.4h},[x0],x6
319    ld1         {v13.4h},[x0],x6
320    ld1         {v15.4h},[x0],x6
321
322
323
324
325
326
327
328
329
330    smlal       v24.4s, v14.4h, v3.h[1]
331    smlsl       v26.4s, v14.4h, v6.h[1]
332    smlsl       v28.4s, v14.4h, v0.h[1]
333    smlsl       v30.4s, v14.4h, v6.h[3]
334
335
336    smlal       v24.4s, v15.4h, v3.h[3]
337    smlsl       v26.4s, v15.4h, v4.h[3]
338    smlsl       v28.4s, v15.4h, v2.h[3]
339    smlal       v30.4s, v15.4h, v5.h[3]
340
341
342    smlal       v20.4s, v12.4h, v3.h[0]
343    smlal       v20.4s, v13.4h, v3.h[2]
344    smlsl       v22.4s, v12.4h, v7.h[0]
345    smlsl       v22.4s, v13.4h, v5.h[2]
346    smlsl       v16.4s, v12.4h, v1.h[0]
347    smlsl       v16.4s, v13.4h, v1.h[2]
348    smlsl       v18.4s, v12.4h, v5.h[0]
349    smlal       v18.4s, v13.4h, v7.h[2]
350
351    cmp         x11,x7
352    bhs         shift1
353
354
355    ld1         {v10.4h},[x0],x6
356    ld1         {v8.4h},[x0],x6
357    ld1         {v11.4h},[x0],x6
358    ld1         {v9.4h},[x0],x6
359
360
361
362    smlal       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
363    smlsl       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
364    smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
365    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
366
367    smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
368    smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
369    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
370    smlal       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
371
372
373
374
375
376    smlal       v20.4s, v10.4h, v0.h[0]
377    smlal       v20.4s, v11.4h, v4.h[2]
378
379
380    smlsl       v22.4s, v10.4h, v0.h[0]
381    smlsl       v22.4s, v11.4h, v2.h[2]
382
383    smlsl       v16.4s, v10.4h, v0.h[0]
384    smlsl       v16.4s, v11.4h, v6.h[2]
385
386    smlal       v18.4s, v10.4h, v0.h[0]
387    smlal       v18.4s, v11.4h, v0.h[2]
388
389
390
391    ld1         {v12.4h},[x0],x6
392    ld1         {v14.4h},[x0],x6
393    ld1         {v13.4h},[x0],x6
394    ld1         {v15.4h},[x0],x6
395
396
397
398
399    smlal       v24.4s, v14.4h, v5.h[1]
400    smlsl       v26.4s, v14.4h, v0.h[2]
401    smlal       v28.4s, v14.4h, v5.h[3]
402    smlal       v30.4s, v14.4h, v4.h[3]
403
404
405    smlal       v24.4s, v15.4h, v5.h[3]
406    smlsl       v26.4s, v15.4h, v1.h[1]
407    smlal       v28.4s, v15.4h, v3.h[1]
408    smlsl       v30.4s, v15.4h, v7.h[3]
409
410
411    smlal       v20.4s, v12.4h, v5.h[0]
412    smlal       v20.4s, v13.4h, v5.h[2]
413    smlsl       v22.4s, v12.4h, v1.h[0]
414    smlsl       v22.4s, v13.4h, v0.h[2]
415    smlal       v16.4s, v12.4h, v7.h[0]
416    smlal       v16.4s, v13.4h, v4.h[2]
417    smlal       v18.4s, v12.4h, v3.h[0]
418    smlal       v18.4s, v13.4h, v6.h[2]
419
420
421    ld1         {v10.4h},[x0],x6
422    ld1         {v8.4h},[x0],x6
423    ld1         {v11.4h},[x0],x6
424    ld1         {v9.4h},[x0],x6
425
426
427
428
429
430
431
432    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
433    smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
434    smlal       v28.4s, v8.4h, v0.h[1]     //// y1 * sin3(part of b2)
435    smlsl       v30.4s, v8.4h, v4.h[1]     //// y1 * sin1(part of b3)
436
437    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
438    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
439    smlal       v28.4s, v9.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
440    smlsl       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
441
442
443
444
445
446    smlal       v20.4s, v10.4h, v6.h[0]
447    smlal       v20.4s, v11.4h, v6.h[2]
448
449
450    smlsl       v22.4s, v10.4h, v2.h[0]
451    smlsl       v22.4s, v11.4h, v3.h[2]
452
453    smlal       v16.4s, v10.4h, v2.h[0]
454    smlal       v16.4s, v11.4h, v0.h[2]
455
456    smlsl       v18.4s, v10.4h, v6.h[0]
457    smlsl       v18.4s, v11.4h, v2.h[2]
458
459    ld1         {v12.4h},[x0],x6
460    ld1         {v14.4h},[x0],x6
461    ld1         {v13.4h},[x0],x6
462    ld1         {v15.4h},[x0],x6
463
464
465    smlal       v24.4s, v14.4h, v7.h[1]
466    smlsl       v26.4s, v14.4h, v5.h[3]
467    smlal       v28.4s, v14.4h, v4.h[1]
468    smlsl       v30.4s, v14.4h, v2.h[3]
469
470
471    smlal       v24.4s, v15.4h, v7.h[3]
472    smlsl       v26.4s, v15.4h, v7.h[1]
473    smlal       v28.4s, v15.4h, v6.h[3]
474    smlsl       v30.4s, v15.4h, v6.h[1]
475
476
477    smlal       v20.4s, v12.4h, v7.h[0]
478    smlal       v20.4s, v13.4h, v7.h[2]
479    smlsl       v22.4s, v12.4h, v5.h[0]
480    smlsl       v22.4s, v13.4h, v6.h[2]
481    smlal       v16.4s, v12.4h, v3.h[0]
482    smlal       v16.4s, v13.4h, v5.h[2]
483    smlsl       v18.4s, v12.4h, v1.h[0]
484    smlsl       v18.4s, v13.4h, v4.h[2]
485
486
487
488shift1:
489    add         v8.4s,  v20.4s ,  v24.4s
490    sub         v10.4s,  v20.4s ,  v24.4s
491
492    add         v12.4s,  v22.4s ,  v26.4s
493    sub         v24.4s,  v22.4s ,  v26.4s
494
495    add         v14.4s,  v16.4s ,  v28.4s
496    sub         v26.4s,  v16.4s ,  v28.4s
497
498
499    add         v16.4s,  v18.4s ,  v30.4s
500    sub         v28.4s,  v18.4s ,  v30.4s
501
502
503    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
504    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
505    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
506    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
507    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
508    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
509    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
510    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
511
512
513    // registers used q15,q14,q6,q7
514
515    umov        x15,v24.d[0]
516    umov        x16,v25.d[0]
517    umov        x19,v26.d[0]
518    umov        x20,v27.d[0]
519
520    trn1        v24.4h, v30.4h, v12.4h
521    trn2        v25.4h, v30.4h, v12.4h
522    trn1        v26.4h, v31.4h, v13.4h
523    trn2        v27.4h, v31.4h, v13.4h
524
525    trn1        v30.2s, v24.2s, v26.2s
526    trn2        v31.2s, v24.2s, v26.2s
527    trn1        v12.2s, v25.2s, v27.2s
528    trn2        v13.2s, v25.2s, v27.2s
529
530    trn1        v24.4h, v14.4h, v18.4h
531    trn2        v25.4h, v14.4h, v18.4h
532    trn1        v26.4h, v15.4h, v19.4h
533    trn2        v27.4h, v15.4h, v19.4h
534
535    trn1        v14.2s, v24.2s, v26.2s
536    trn2        v15.2s, v24.2s, v26.2s
537    trn1        v18.2s, v25.2s, v27.2s
538    trn2        v19.2s, v25.2s, v27.2s
539
540    mov         v24.d[0],x15
541    mov         v25.d[0],x16
542    mov         v26.d[0],x19
543    mov         v27.d[0],x20
544
545// d30 =x0 1- 4 values
546// d31 =x2 1- 4 values
547// d12=x1 1- 4 values
548// d13=x3 1- 4 values
549// d14 =x0 28-31 values
550// d15 =x2 28- 31 values
551// d18=x1 28- 31 values
552// d19=x3 28- 31 values
553
554
555
556    st1         { v30.4h, v31.4h},[x1],#16
557    st1         { v12.4h, v13.4h},[x1],#16
558    add         x1,x1,#192
559    st1         { v14.4h, v15.4h},[x1],#16
560    st1         { v18.4h, v19.4h},[x1],#16
561    sub         x1,x1,#224
562
563    mov         x0,x8
564
565
566
567
568
569    ld1         {v10.4h},[x0],x6
570    ld1         {v8.4h},[x0],x6
571    ld1         {v11.4h},[x0],x6
572    ld1         {v9.4h},[x0],x6
573
574
575
576
577    smull       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
578    smull       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
579    smull       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
580    smull       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
581
582    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
583    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
584    smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
585    smlsl       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
586
587
588
589
590
591    smull       v20.4s, v10.4h, v0.h[0]
592    smlal       v20.4s, v11.4h, v4.h[2]
593
594
595    smull       v22.4s, v10.4h, v0.h[0]
596    smlal       v22.4s, v11.4h, v5.h[2]
597
598    smull       v16.4s, v10.4h, v0.h[0]
599    smlal       v16.4s, v11.4h, v6.h[2]
600
601    smull       v18.4s, v10.4h, v0.h[0]
602    smlal       v18.4s, v11.4h, v7.h[2]
603    cmp         x11,x10
604    bhs         shift2
605
606    ld1         {v12.4h},[x0],x6
607    ld1         {v14.4h},[x0],x6
608    ld1         {v13.4h},[x0],x6
609    ld1         {v15.4h},[x0],x6
610
611
612    smlsl       v24.4s, v14.4h, v4.h[3]
613    smlsl       v26.4s, v14.4h, v2.h[1]
614    smlsl       v28.4s, v14.4h, v0.h[1]
615    smlsl       v30.4s, v14.4h, v2.h[3]
616
617
618    smlsl       v24.4s, v15.4h, v0.h[3]
619    smlsl       v26.4s, v15.4h, v3.h[1]
620    smlsl       v28.4s, v15.4h, v6.h[3]
621    smlal       v30.4s, v15.4h, v5.h[3]
622
623
624    smlsl       v20.4s, v12.4h, v7.h[0]
625    smlsl       v20.4s, v13.4h, v2.h[2]
626    smlsl       v22.4s, v12.4h, v5.h[0]
627    smlsl       v22.4s, v13.4h, v0.h[2]
628    smlsl       v16.4s, v12.4h, v3.h[0]
629    smlsl       v16.4s, v13.4h, v3.h[2]
630    smlsl       v18.4s, v12.4h, v1.h[0]
631    smlsl       v18.4s, v13.4h, v6.h[2]
632
633    cmp         x11,x9
634    bhs         shift2
635
636
637    ld1         {v10.4h},[x0],x6
638    ld1         {v8.4h},[x0],x6
639    ld1         {v11.4h},[x0],x6
640    ld1         {v9.4h},[x0],x6
641
642
643
644
645
646
647
648    smlsl       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
649    smlal       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
650    smlal       v28.4s, v8.4h, v2.h[3]     //// y1 * sin3(part of b2)
651    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
652
653    smlal       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
654    smlal       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
655    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
656    smlsl       v30.4s, v9.4h, v6.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
657
658
659
660
661
662    smlsl       v20.4s, v10.4h, v2.h[0]
663    smlsl       v20.4s, v11.4h, v6.h[2]
664
665
666    smlsl       v22.4s, v10.4h, v6.h[0]
667    smlal       v22.4s, v11.4h, v4.h[2]
668
669    smlal       v16.4s, v10.4h, v6.h[0]
670    smlal       v16.4s, v11.4h, v0.h[2]
671
672    smlal       v18.4s, v10.4h, v2.h[0]
673    smlal       v18.4s, v11.4h, v5.h[2]
674
675    cmp         x11,x5
676    bhs         shift2
677
678
679    ld1         {v12.4h},[x0],x6
680    ld1         {v14.4h},[x0],x6
681    ld1         {v13.4h},[x0],x6
682    ld1         {v15.4h},[x0],x6
683
684
685
686
687
688    smlal       v24.4s, v14.4h, v2.h[3]
689    smlal       v26.4s, v14.4h, v3.h[3]
690    smlsl       v28.4s, v14.4h, v5.h[3]
691    smlsl       v30.4s, v14.4h, v0.h[3]
692
693
694    smlal       v24.4s, v15.4h, v1.h[3]
695    smlsl       v26.4s, v15.4h, v6.h[3]
696    smlsl       v28.4s, v15.4h, v0.h[3]
697    smlal       v30.4s, v15.4h, v7.h[3]
698
699
700    smlal       v20.4s, v12.4h, v5.h[0]
701    smlal       v20.4s, v13.4h, v0.h[2]
702    smlal       v22.4s, v12.4h, v1.h[0]
703    smlal       v22.4s, v13.4h, v6.h[2]
704    smlal       v16.4s, v12.4h, v7.h[0]
705    smlsl       v16.4s, v13.4h, v2.h[2]
706    smlsl       v18.4s, v12.4h, v3.h[0]
707    smlsl       v18.4s, v13.4h, v4.h[2]
708
709
710    cmp         x11,x7
711    bhs         shift2
712
713
714    ld1         {v10.4h},[x0],x6
715    ld1         {v8.4h},[x0],x6
716    ld1         {v11.4h},[x0],x6
717    ld1         {v9.4h},[x0],x6
718
719
720
721
722
723
724
725    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
726    smlsl       v26.4s, v8.4h, v1.h[1]     //// y1 * cos3(part of b1)
727    smlsl       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
728    smlal       v30.4s, v8.4h, v0.h[3]     //// y1 * sin1(part of b3)
729
730    smlsl       v24.4s, v9.4h, v5.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
731    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
732    smlal       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
733    smlal       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
734
735
736
737
738
739    smlal       v20.4s, v10.4h, v0.h[0]
740    smlsl       v20.4s, v11.4h, v7.h[2]
741
742
743    smlsl       v22.4s, v10.4h, v0.h[0]
744    smlsl       v22.4s, v11.4h, v1.h[2]
745
746    smlsl       v16.4s, v10.4h, v0.h[0]
747    smlal       v16.4s, v11.4h, v5.h[2]
748
749    smlal       v18.4s, v10.4h, v0.h[0]
750    smlal       v18.4s, v11.4h, v3.h[2]
751
752
753
754    ld1         {v12.4h},[x0],x6
755    ld1         {v14.4h},[x0],x6
756    ld1         {v13.4h},[x0],x6
757    ld1         {v15.4h},[x0],x6
758
759
760    smlsl       v24.4s, v14.4h, v0.h[1]
761    smlal       v26.4s, v14.4h, v6.h[1]
762    smlal       v28.4s, v14.4h, v4.h[1]
763    smlsl       v30.4s, v14.4h, v1.h[1]
764
765
766    smlsl       v24.4s, v15.4h, v3.h[3]
767    smlal       v26.4s, v15.4h, v0.h[1]
768    smlsl       v28.4s, v15.4h, v5.h[1]
769    smlsl       v30.4s, v15.4h, v6.h[1]
770
771
772    smlsl       v20.4s, v12.4h, v3.h[0]
773    smlsl       v20.4s, v13.4h, v1.h[2]
774    smlsl       v22.4s, v12.4h, v7.h[0]
775    smlal       v22.4s, v13.4h, v3.h[2]
776    smlal       v16.4s, v12.4h, v1.h[0]
777    smlal       v16.4s, v13.4h, v7.h[2]
778    smlsl       v18.4s, v12.4h, v5.h[0]
779    smlsl       v18.4s, v13.4h, v2.h[2]
780
781    ld1         {v10.4h},[x0],x6
782    ld1         {v8.4h},[x0],x6
783    ld1         {v11.4h},[x0],x6
784    ld1         {v9.4h},[x0],x6
785
786
787
788
789    smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
790    smlal       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
791    smlsl       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
792    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
793
794    smlal       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
795    smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
796    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
797    smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
798
799
800
801
802
803    smlsl       v20.4s, v10.4h, v6.h[0]
804    smlal       v20.4s, v11.4h, v5.h[2]
805
806
807    smlal       v22.4s, v10.4h, v2.h[0]
808    smlal       v22.4s, v11.4h, v7.h[2]
809
810    smlsl       v16.4s, v10.4h, v2.h[0]
811    smlsl       v16.4s, v11.4h, v4.h[2]
812
813    smlal       v18.4s, v10.4h, v6.h[0]
814    smlal       v18.4s, v11.4h, v1.h[2]
815
816
817    ld1         {v12.4h},[x0],x6
818    ld1         {v14.4h},[x0],x6
819    ld1         {v13.4h},[x0],x6
820    ld1         {v15.4h},[x0],x6
821
822
823
824
825
826    smlal       v24.4s, v14.4h, v1.h[1]
827    smlsl       v26.4s, v14.4h, v0.h[3]
828    smlal       v28.4s, v14.4h, v1.h[3]
829    smlsl       v30.4s, v14.4h, v3.h[1]
830
831
832    smlal       v24.4s, v15.4h, v5.h[3]
833    smlsl       v26.4s, v15.4h, v5.h[1]
834    smlal       v28.4s, v15.4h, v4.h[3]
835    smlsl       v30.4s, v15.4h, v4.h[1]
836
837
838    smlal       v20.4s, v12.4h, v1.h[0]
839    smlal       v20.4s, v13.4h, v3.h[2]
840    smlsl       v22.4s, v12.4h, v3.h[0]
841    smlsl       v22.4s, v13.4h, v2.h[2]
842    smlal       v16.4s, v12.4h, v5.h[0]
843    smlal       v16.4s, v13.4h, v1.h[2]
844    smlsl       v18.4s, v12.4h, v7.h[0]
845    smlsl       v18.4s, v13.4h, v0.h[2]
846
847shift2:
848    add         v8.4s,  v20.4s ,  v24.4s
849    sub         v10.4s,  v20.4s ,  v24.4s
850
851    add         v12.4s,  v22.4s ,  v26.4s
852    sub         v24.4s,  v22.4s ,  v26.4s
853
854    add         v14.4s,  v16.4s ,  v28.4s
855    sub         v26.4s,  v16.4s ,  v28.4s
856
857
858    add         v16.4s,  v18.4s ,  v30.4s
859    sub         v28.4s,  v18.4s ,  v30.4s
860
861
862    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
863    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
864    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
865    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
866    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
867    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
868    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
869    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
870
871    umov        x15,v24.d[0]
872    umov        x16,v25.d[0]
873    umov        x19,v26.d[0]
874    umov        x20,v27.d[0]
875
876    trn1        v24.4h, v30.4h, v12.4h
877    trn2        v25.4h, v30.4h, v12.4h
878    trn1        v26.4h, v31.4h, v13.4h
879    trn2        v27.4h, v31.4h, v13.4h
880
881    trn1        v30.2s, v24.2s, v26.2s
882    trn2        v31.2s, v24.2s, v26.2s
883    trn1        v12.2s, v25.2s, v27.2s
884    trn2        v13.2s, v25.2s, v27.2s
885
886    trn1        v24.4h, v14.4h, v18.4h
887    trn2        v25.4h, v14.4h, v18.4h
888    trn1        v26.4h, v15.4h, v19.4h
889    trn2        v27.4h, v15.4h, v19.4h
890
891    trn1        v14.2s, v24.2s, v26.2s
892    trn2        v15.2s, v24.2s, v26.2s
893    trn1        v18.2s, v25.2s, v27.2s
894    trn2        v19.2s, v25.2s, v27.2s
895
896    mov         v24.d[0],x15
897    mov         v25.d[0],x16
898    mov         v26.d[0],x19
899    mov         v27.d[0],x20
900
901    st1         { v30.4h, v31.4h},[x1],#16
902    st1         { v12.4h, v13.4h},[x1],#16
903    add         x1,x1,#128
904    st1         { v14.4h, v15.4h},[x1],#16
905    st1         { v18.4h, v19.4h},[x1],#16
906    sub         x1,x1,#160
907    mov         x0,x8
908
909
910
911    ld1         {v10.4h},[x0],x6
912    ld1         {v8.4h},[x0],x6
913    ld1         {v11.4h},[x0],x6
914    ld1         {v9.4h},[x0],x6
915
916
917    smull       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
918    smull       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
919    smull       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
920    smull       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
921
922    smlsl       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
923    smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
924    smlsl       v28.4s, v9.4h, v0.h[2]     //// y1 * sin3 - y3 * cos1(part of b2)
925    smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
926
927
928
929
930
931    smull       v20.4s, v10.4h, v0.h[0]
932    smlsl       v20.4s, v11.4h, v7.h[2]
933
934
935    smull       v22.4s, v10.4h, v0.h[0]
936    smlsl       v22.4s, v11.4h, v6.h[2]
937
938    smull       v16.4s, v10.4h, v0.h[0]
939    smlsl       v16.4s, v11.4h, v5.h[2]
940
941    smull       v18.4s, v10.4h, v0.h[0]
942    smlsl       v18.4s, v11.4h, v4.h[2]
943
944    cmp         x11,x10
945    bhs         shift3
946
947    ld1         {v12.4h},[x0],x6
948    ld1         {v14.4h},[x0],x6
949    ld1         {v13.4h},[x0],x6
950    ld1         {v15.4h},[x0],x6
951
952
953
954
955    smlsl       v24.4s, v14.4h, v5.h[1]
956    smlsl       v26.4s, v14.4h, v7.h[3]
957    smlal       v28.4s, v14.4h, v5.h[3]
958    smlal       v30.4s, v14.4h, v3.h[1]
959
960
961    smlal       v24.4s, v15.4h, v2.h[1]
962    smlal       v26.4s, v15.4h, v1.h[1]
963    smlal       v28.4s, v15.4h, v4.h[3]
964    smlsl       v30.4s, v15.4h, v7.h[3]
965
966
967    smlsl       v20.4s, v12.4h, v1.h[0]
968    smlal       v20.4s, v13.4h, v6.h[2]
969    smlsl       v22.4s, v12.4h, v3.h[0]
970    smlal       v22.4s, v13.4h, v3.h[2]
971    smlsl       v16.4s, v12.4h, v5.h[0]
972    smlal       v16.4s, v13.4h, v0.h[2]
973    smlsl       v18.4s, v12.4h, v7.h[0]
974    smlal       v18.4s, v13.4h, v2.h[2]
975
976    cmp         x11,x9
977    bhs         shift3
978
979    ld1         {v10.4h},[x0],x6
980    ld1         {v8.4h},[x0],x6
981    ld1         {v11.4h},[x0],x6
982    ld1         {v9.4h},[x0],x6
983
984    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
985    smlsl       v26.4s, v8.4h, v5.h[1]     //// y1 * cos3(part of b1)
986    smlsl       v28.4s, v8.4h, v0.h[3]     //// y1 * sin3(part of b2)
987    smlsl       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
988
989    smlsl       v24.4s, v9.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
990    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
991    smlal       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
992    smlal       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
993
994
995
996
997
998    smlal       v20.4s, v10.4h, v2.h[0]
999    smlsl       v20.4s, v11.4h, v5.h[2]
1000
1001
1002    smlal       v22.4s, v10.4h, v6.h[0]
1003    smlsl       v22.4s, v11.4h, v0.h[2]
1004
1005    smlsl       v16.4s, v10.4h, v6.h[0]
1006    smlsl       v16.4s, v11.4h, v4.h[2]
1007
1008    smlsl       v18.4s, v10.4h, v2.h[0]
1009    smlal       v18.4s, v11.4h, v6.h[2]
1010
1011    cmp         x11,x5
1012    bhs         shift3
1013
1014
1015    ld1         {v12.4h},[x0],x6
1016    ld1         {v14.4h},[x0],x6
1017    ld1         {v13.4h},[x0],x6
1018    ld1         {v15.4h},[x0],x6
1019
1020
1021
1022
1023
1024
1025    smlsl       v24.4s, v14.4h, v7.h[1]
1026    smlal       v26.4s, v14.4h, v2.h[1]
1027    smlal       v28.4s, v14.4h, v4.h[1]
1028    smlsl       v30.4s, v14.4h, v5.h[1]
1029
1030
1031    smlal       v24.4s, v15.4h, v0.h[3]
1032    smlal       v26.4s, v15.4h, v7.h[1]
1033    smlsl       v28.4s, v15.4h, v1.h[1]
1034    smlsl       v30.4s, v15.4h, v6.h[1]
1035
1036
1037    smlsl       v20.4s, v12.4h, v3.h[0]
1038    smlal       v20.4s, v13.4h, v4.h[2]
1039    smlal       v22.4s, v12.4h, v7.h[0]
1040    smlal       v22.4s, v13.4h, v2.h[2]
1041    smlal       v16.4s, v12.4h, v1.h[0]
1042    smlsl       v16.4s, v13.4h, v6.h[2]
1043    smlal       v18.4s, v12.4h, v5.h[0]
1044    smlsl       v18.4s, v13.4h, v0.h[2]
1045
1046
1047    cmp         x11,x7
1048    bhs         shift3
1049
1050
1051    ld1         {v10.4h},[x0],x6
1052    ld1         {v8.4h},[x0],x6
1053    ld1         {v11.4h},[x0],x6
1054    ld1         {v9.4h},[x0],x6
1055
1056
1057    smlsl       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
1058    smlsl       v26.4s, v8.4h, v0.h[1]     //// y1 * cos3(part of b1)
1059    smlal       v28.4s, v8.4h, v6.h[3]     //// y1 * sin3(part of b2)
1060    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
1061
1062    smlsl       v24.4s, v9.4h, v0.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
1063    smlal       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1064    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
1065    smlsl       v30.4s, v9.4h, v2.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
1066
1067
1068
1069
1070
1071    smlal       v20.4s, v10.4h, v0.h[0]
1072    smlsl       v20.4s, v11.4h, v3.h[2]
1073
1074
1075    smlsl       v22.4s, v10.4h, v0.h[0]
1076    smlsl       v22.4s, v11.4h, v5.h[2]
1077
1078    smlsl       v16.4s, v10.4h, v0.h[0]
1079    smlal       v16.4s, v11.4h, v1.h[2]
1080
1081    smlal       v18.4s, v10.4h, v0.h[0]
1082    smlal       v18.4s, v11.4h, v7.h[2]
1083
1084
1085    ld1         {v12.4h},[x0],x6
1086    ld1         {v14.4h},[x0],x6
1087    ld1         {v13.4h},[x0],x6
1088    ld1         {v15.4h},[x0],x6
1089
1090
1091
1092    smlal       v24.4s, v14.4h, v6.h[3]
1093    smlal       v26.4s, v14.4h, v3.h[3]
1094    smlsl       v28.4s, v14.4h, v1.h[3]
1095    smlal       v30.4s, v14.4h, v7.h[1]
1096
1097
1098    smlal       v24.4s, v15.4h, v1.h[3]
1099    smlsl       v26.4s, v15.4h, v2.h[3]
1100    smlal       v28.4s, v15.4h, v7.h[1]
1101    smlal       v30.4s, v15.4h, v4.h[1]
1102
1103
1104    smlsl       v20.4s, v12.4h, v5.h[0]
1105    smlal       v20.4s, v13.4h, v2.h[2]
1106    smlal       v22.4s, v12.4h, v1.h[0]
1107    smlsl       v22.4s, v13.4h, v7.h[2]
1108    smlsl       v16.4s, v12.4h, v7.h[0]
1109    smlsl       v16.4s, v13.4h, v3.h[2]
1110    smlsl       v18.4s, v12.4h, v3.h[0]
1111    smlal       v18.4s, v13.4h, v1.h[2]
1112
1113
1114
1115    ld1         {v10.4h},[x0],x6
1116    ld1         {v8.4h},[x0],x6
1117    ld1         {v11.4h},[x0],x6
1118    ld1         {v9.4h},[x0],x6
1119
1120
1121
1122
1123    smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
1124    smlsl       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
1125    smlal       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
1126    smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
1127
1128    smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1129    smlal       v26.4s, v9.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
1130    smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1131    smlal       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
1132
1133
1134
1135
1136
1137    smlal       v20.4s, v10.4h, v6.h[0]
1138    smlsl       v20.4s, v11.4h, v1.h[2]
1139
1140
1141    smlsl       v22.4s, v10.4h, v2.h[0]
1142    smlal       v22.4s, v11.4h, v4.h[2]
1143
1144    smlal       v16.4s, v10.4h, v2.h[0]
1145    smlsl       v16.4s, v11.4h, v7.h[2]
1146
1147    smlsl       v18.4s, v10.4h, v6.h[0]
1148    smlsl       v18.4s, v11.4h, v5.h[2]
1149
1150
1151    ld1         {v12.4h},[x0],x6
1152    ld1         {v14.4h},[x0],x6
1153    ld1         {v13.4h},[x0],x6
1154    ld1         {v15.4h},[x0],x6
1155
1156    smlal       v24.4s, v14.4h, v4.h[3]
1157    smlsl       v26.4s, v14.4h, v6.h[1]
1158    smlal       v28.4s, v14.4h, v7.h[3]
1159    smlal       v30.4s, v14.4h, v6.h[3]
1160
1161
1162    smlal       v24.4s, v15.4h, v3.h[3]
1163    smlsl       v26.4s, v15.4h, v3.h[1]
1164    smlal       v28.4s, v15.4h, v2.h[3]
1165    smlsl       v30.4s, v15.4h, v2.h[1]
1166
1167
1168    smlsl       v20.4s, v12.4h, v7.h[0]
1169    smlal       v20.4s, v13.4h, v0.h[2]
1170    smlal       v22.4s, v12.4h, v5.h[0]
1171    smlsl       v22.4s, v13.4h, v1.h[2]
1172    smlsl       v16.4s, v12.4h, v3.h[0]
1173    smlal       v16.4s, v13.4h, v2.h[2]
1174    smlal       v18.4s, v12.4h, v1.h[0]
1175    smlsl       v18.4s, v13.4h, v3.h[2]
1176
// shift3: branch target of the early exits above and fall-through end of the
// first half of a stage-1 pass. Going by the comments on the narrowing shifts
// below, v20/v22/v16/v18 carry a0..a3 and v24/v26/v28/v30 carry b0..b3 as
// 32-bit sums. This section forms a+b and a-b, narrows the results to 16 bits,
// transposes two 4x4 tiles and writes them out through x1.
1177shift3:
// v8 = a0 + b0, v10 = a0 - b0
1178    add         v8.4s,  v20.4s ,  v24.4s
1179    sub         v10.4s,  v20.4s ,  v24.4s
1180
// v12 = a1 + b1, v24 is reused for a1 - b1
1181    add         v12.4s,  v22.4s ,  v26.4s
1182    sub         v24.4s,  v22.4s ,  v26.4s
1183
// v14 = a2 + b2, v26 is reused for a2 - b2
1184    add         v14.4s,  v16.4s ,  v28.4s
1185    sub         v26.4s,  v16.4s ,  v28.4s
1186
1187
// v16 = a3 + b3, v28 is reused for a3 - b3
1188    add         v16.4s,  v18.4s ,  v30.4s
1189    sub         v28.4s,  v18.4s ,  v30.4s
1190
1191
// Rounding, saturating narrow of every 32-bit lane to 16 bits.
// shift_stage1_idct is a symbol defined outside this section.
1192    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1193    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1194    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1195    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1196    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1197    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1198    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1199    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1200
// Park the low 64 bits of v24-v27 in general registers: the transposes below
// use v24-v27 as scratch and the values are put back afterwards.
// NOTE(review): x19 and x20 are callee-saved under AAPCS64 - confirm that the
// prologue (not visible in this section) preserves them.
1201    umov        x15,v24.d[0]
1202    umov        x16,v25.d[0]
1203    umov        x19,v26.d[0]
1204    umov        x20,v27.d[0]
1205
// 4x4 transpose of 16-bit lanes for the rows held in v30, v12, v31, v13.
// First interleave halfwords pairwise ...
1206    trn1        v24.4h, v30.4h, v12.4h
1207    trn2        v25.4h, v30.4h, v12.4h
1208    trn1        v26.4h, v31.4h, v13.4h
1209    trn2        v27.4h, v31.4h, v13.4h
1210
// ... then interleave 32-bit pairs. Afterwards v30 holds lane 0 of each row,
// v12 lane 1, v31 lane 2 and v13 lane 3.
1211    trn1        v30.2s, v24.2s, v26.2s
1212    trn2        v31.2s, v24.2s, v26.2s
1213    trn1        v12.2s, v25.2s, v27.2s
1214    trn2        v13.2s, v25.2s, v27.2s
1215
// Same transpose for the rows held in v14, v18, v15, v19.
1216    trn1        v24.4h, v14.4h, v18.4h
1217    trn2        v25.4h, v14.4h, v18.4h
1218    trn1        v26.4h, v15.4h, v19.4h
1219    trn2        v27.4h, v15.4h, v19.4h
1220
// Afterwards v14 holds lane 0 of each row, v18 lane 1, v15 lane 2, v19 lane 3.
1221    trn1        v14.2s, v24.2s, v26.2s
1222    trn2        v15.2s, v24.2s, v26.2s
1223    trn1        v18.2s, v25.2s, v27.2s
1224    trn2        v19.2s, v25.2s, v27.2s
1225
// Put back the low halves of v24-v27 that were parked above.
1226    mov         v24.d[0],x15
1227    mov         v25.d[0],x16
1228    mov         v26.d[0],x19
1229    mov         v27.d[0],x20
// Write 32 bytes at x1, skip 64 bytes, write another 32 bytes, then step back
// by 96: x1 ends up 32 bytes past the value it had on entry to this section.
1230    st1         { v30.4h, v31.4h},[x1],#16
1231    st1         { v12.4h, v13.4h},[x1],#16
1232    add         x1,x1,#64
1233    st1         { v14.4h, v15.4h},[x1],#16
1234    st1         { v18.4h, v19.4h},[x1],#16
1235    sub         x1,x1,#96
1236
1237    mov         x0,x8
1238
1239
1240
1241    ld1         {v10.4h},[x0],x6
1242    ld1         {v8.4h},[x0],x6
1243    ld1         {v11.4h},[x0],x6
1244    ld1         {v9.4h},[x0],x6
1245
1246
1247    smull       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
1248    smull       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
1249    smull       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
1250    smull       v30.4s, v8.4h, v7.h[3]     //// y1 * sin1(part of b3)
1251
1252    smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1253    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
1254    smlsl       v28.4s, v9.4h, v5.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
1255    smlsl       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1256
1257
1258
1259
1260
1261    smull       v20.4s, v10.4h, v0.h[0]
1262    smlsl       v20.4s, v11.4h, v3.h[2]
1263
1264
1265    smull       v22.4s, v10.4h, v0.h[0]
1266    smlsl       v22.4s, v11.4h, v2.h[2]
1267
1268    smull       v16.4s, v10.4h, v0.h[0]
1269    smlsl       v16.4s, v11.4h, v1.h[2]
1270
1271    smull       v18.4s, v10.4h, v0.h[0]
1272    smlsl       v18.4s, v11.4h, v0.h[2]
1273
1274    cmp         x11,x10
1275    bhs         shift4
1276
1277    ld1         {v12.4h},[x0],x6
1278    ld1         {v14.4h},[x0],x6
1279    ld1         {v13.4h},[x0],x6
1280    ld1         {v15.4h},[x0],x6
1281
1282
1283
1284
1285
1286
1287    smlal       v24.4s, v14.4h, v0.h[1]
1288    smlal       v26.4s, v14.4h, v1.h[3]
1289    smlal       v28.4s, v14.4h, v4.h[1]
1290    smlal       v30.4s, v14.4h, v6.h[3]
1291
1292
1293    smlsl       v24.4s, v15.4h, v4.h[1]
1294    smlsl       v26.4s, v15.4h, v0.h[3]
1295    smlsl       v28.4s, v15.4h, v2.h[3]
1296    smlsl       v30.4s, v15.4h, v6.h[1]
1297
1298
1299    smlal       v20.4s, v12.4h, v7.h[0]
1300    smlal       v20.4s, v13.4h, v5.h[2]
1301    smlal       v22.4s, v12.4h, v5.h[0]
1302    smlsl       v22.4s, v13.4h, v7.h[2]
1303    smlal       v16.4s, v12.4h, v3.h[0]
1304    smlsl       v16.4s, v13.4h, v4.h[2]
1305    smlal       v18.4s, v12.4h, v1.h[0]
1306    smlsl       v18.4s, v13.4h, v1.h[2]
1307
1308    cmp         x11,x9
1309    bhs         shift4
1310
1311    ld1         {v10.4h},[x0],x6
1312    ld1         {v8.4h},[x0],x6
1313    ld1         {v11.4h},[x0],x6
1314    ld1         {v9.4h},[x0],x6
1315
1316
1317
1318    smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
1319    smlal       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
1320    smlal       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
1321    smlal       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
1322
1323    smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1324    smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1325    smlsl       v28.4s, v9.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1326    smlsl       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1327
1328
1329
1330
1331
1332    smlsl       v20.4s, v10.4h, v2.h[0]
1333    smlal       v20.4s, v11.4h, v1.h[2]
1334
1335
1336    smlsl       v22.4s, v10.4h, v6.h[0]
1337    smlal       v22.4s, v11.4h, v3.h[2]
1338
1339    smlal       v16.4s, v10.4h, v6.h[0]
1340    smlsl       v16.4s, v11.4h, v7.h[2]
1341
1342    smlal       v18.4s, v10.4h, v2.h[0]
1343    smlsl       v18.4s, v11.4h, v2.h[2]
1344
1345    cmp         x11,x5
1346    bhs         shift4
1347
1348
1349    ld1         {v12.4h},[x0],x6
1350    ld1         {v14.4h},[x0],x6
1351    ld1         {v13.4h},[x0],x6
1352    ld1         {v15.4h},[x0],x6
1353
1354
1355
1356
1357
1358
1359    smlsl       v24.4s, v14.4h, v1.h[1]
1360    smlsl       v26.4s, v14.4h, v7.h[3]
1361    smlal       v28.4s, v14.4h, v1.h[3]
1362    smlal       v30.4s, v14.4h, v4.h[3]
1363
1364
1365    smlal       v24.4s, v15.4h, v2.h[1]
1366    smlal       v26.4s, v15.4h, v5.h[1]
1367    smlsl       v28.4s, v15.4h, v3.h[1]
1368    smlsl       v30.4s, v15.4h, v4.h[1]
1369
1370
1371    smlsl       v20.4s, v12.4h, v5.h[0]
1372    smlsl       v20.4s, v13.4h, v7.h[2]
1373    smlsl       v22.4s, v12.4h, v1.h[0]
1374    smlal       v22.4s, v13.4h, v1.h[2]
1375    smlsl       v16.4s, v12.4h, v7.h[0]
1376    smlal       v16.4s, v13.4h, v5.h[2]
1377    smlal       v18.4s, v12.4h, v3.h[0]
1378    smlsl       v18.4s, v13.4h, v3.h[2]
1379
1380    cmp         x11,x7
1381    bhs         shift4
1382
1383
1384    ld1         {v10.4h},[x0],x6
1385    ld1         {v8.4h},[x0],x6
1386    ld1         {v11.4h},[x0],x6
1387    ld1         {v9.4h},[x0],x6
1388
1389
1390    smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
1391    smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
1392    smlal       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
1393    smlal       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
1394
1395    smlsl       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1396    smlal       v26.4s, v9.4h, v0.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1397    smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1398    smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1399
1400
1401
1402
1403
1404    smlal       v20.4s, v10.4h, v0.h[0]
1405    smlsl       v20.4s, v11.4h, v0.h[2]
1406
1407
1408    smlsl       v22.4s, v10.4h, v0.h[0]
1409    smlal       v22.4s, v11.4h, v6.h[2]
1410
1411    smlsl       v16.4s, v10.4h, v0.h[0]
1412    smlal       v16.4s, v11.4h, v2.h[2]
1413
1414    smlal       v18.4s, v10.4h, v0.h[0]
1415    smlsl       v18.4s, v11.4h, v4.h[2]
1416
1417
1418
1419
1420    ld1         {v12.4h},[x0],x6
1421    ld1         {v14.4h},[x0],x6
1422    ld1         {v13.4h},[x0],x6
1423    ld1         {v15.4h},[x0],x6
1424
1425
1426
1427
1428
1429
1430    smlal       v24.4s, v14.4h, v3.h[1]
1431    smlsl       v26.4s, v14.4h, v2.h[1]
1432    smlal       v28.4s, v14.4h, v7.h[3]
1433    smlal       v30.4s, v14.4h, v2.h[3]
1434
1435
1436    smlsl       v24.4s, v15.4h, v0.h[3]
1437    smlal       v26.4s, v15.4h, v4.h[3]
1438    smlal       v28.4s, v15.4h, v6.h[3]
1439    smlsl       v30.4s, v15.4h, v2.h[1]
1440
1441
1442    smlal       v20.4s, v12.4h, v3.h[0]
1443    smlsl       v20.4s, v13.4h, v6.h[2]
1444    smlal       v22.4s, v12.4h, v7.h[0]
1445    smlsl       v22.4s, v13.4h, v4.h[2]
1446    smlsl       v16.4s, v12.4h, v1.h[0]
1447    smlal       v16.4s, v13.4h, v0.h[2]
1448    smlal       v18.4s, v12.4h, v5.h[0]
1449    smlsl       v18.4s, v13.4h, v5.h[2]
1450
1451
1452    ld1         {v10.4h},[x0],x6
1453    ld1         {v8.4h},[x0],x6
1454    ld1         {v11.4h},[x0],x6
1455    ld1         {v9.4h},[x0],x6
1456
1457
1458
1459
1460
1461    smlal       v24.4s, v8.4h, v3.h[3]     //// y1 * cos1(part of b0)
1462    smlsl       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
1463    smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
1464    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
1465
1466    smlsl       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
1467    smlsl       v26.4s, v9.4h, v6.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
1468    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
1469    smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1470
1471
1472
1473
1474
1475    smlsl       v20.4s, v10.4h, v6.h[0]
1476    smlal       v20.4s, v11.4h, v2.h[2]
1477
1478
1479    smlal       v22.4s, v10.4h, v2.h[0]
1480    smlsl       v22.4s, v11.4h, v0.h[2]
1481
1482    smlsl       v16.4s, v10.4h, v2.h[0]
1483    smlal       v16.4s, v11.4h, v3.h[2]
1484
1485    smlal       v18.4s, v10.4h, v6.h[0]
1486    smlsl       v18.4s, v11.4h, v6.h[2]
1487
1488
1489    ld1         {v12.4h},[x0],x6
1490    ld1         {v14.4h},[x0],x6
1491    ld1         {v13.4h},[x0],x6
1492    ld1         {v15.4h},[x0],x6
1493
1494
1495
1496
1497    smlsl       v24.4s, v14.4h, v5.h[1]
1498    smlal       v26.4s, v14.4h, v3.h[3]
1499    smlsl       v28.4s, v14.4h, v2.h[1]
1500    smlal       v30.4s, v14.4h, v0.h[3]
1501
1502
1503    smlal       v24.4s, v15.4h, v1.h[3]
1504    smlsl       v26.4s, v15.4h, v1.h[1]
1505    smlal       v28.4s, v15.4h, v0.h[3]
1506    smlsl       v30.4s, v15.4h, v0.h[1]
1507
1508
1509    smlsl       v20.4s, v12.4h, v1.h[0]
1510    smlal       v20.4s, v13.4h, v4.h[2]
1511    smlal       v22.4s, v12.4h, v3.h[0]
1512    smlsl       v22.4s, v13.4h, v5.h[2]
1513    smlsl       v16.4s, v12.4h, v5.h[0]
1514    smlal       v16.4s, v13.4h, v6.h[2]
1515    smlal       v18.4s, v12.4h, v7.h[0]
1516    smlsl       v18.4s, v13.4h, v7.h[2]
1517
// shift4: branch target of the early exits above and fall-through end of the
// second half of a stage-1 pass. Going by the comments on the narrowing shifts
// below, v20/v22/v16/v18 carry a0..a3 and v24/v26/v28/v30 carry b0..b3 as
// 32-bit sums. This section forms a+b and a-b, narrows to 16 bits, transposes
// two 4x4 tiles, stores them through x1 and then closes the stage-1 loop.
1518shift4:
// v8 = a0 + b0, v10 = a0 - b0
1519    add         v8.4s,  v20.4s ,  v24.4s
1520    sub         v10.4s,  v20.4s ,  v24.4s
1521
// v12 = a1 + b1, v24 is reused for a1 - b1
1522    add         v12.4s,  v22.4s ,  v26.4s
1523    sub         v24.4s,  v22.4s ,  v26.4s
1524
// v14 = a2 + b2, v26 is reused for a2 - b2
1525    add         v14.4s,  v16.4s ,  v28.4s
1526    sub         v26.4s,  v16.4s ,  v28.4s
1527
1528
// v16 = a3 + b3, v28 is reused for a3 - b3
1529    add         v16.4s,  v18.4s ,  v30.4s
1530    sub         v28.4s,  v18.4s ,  v30.4s
1531
1532
// Rounding, saturating narrow of every 32-bit lane to 16 bits.
// shift_stage1_idct is a symbol defined outside this section.
1533    sqrshrn     v30.4h, v8.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
1534    sqrshrn     v19.4h, v10.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
1535    sqrshrn     v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
1536    sqrshrn     v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
1537    sqrshrn     v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
1538    sqrshrn     v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
1539    sqrshrn     v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
1540    sqrshrn     v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
1541
// Park the low 64 bits of v24-v27 in general registers: the transposes below
// use v24-v27 as scratch and the values are put back afterwards.
// NOTE(review): x19 and x20 are callee-saved under AAPCS64 - confirm that the
// prologue (not visible in this section) preserves them.
1542    umov        x15,v24.d[0]
1543    umov        x16,v25.d[0]
1544    umov        x19,v26.d[0]
1545    umov        x20,v27.d[0]
1546
// 4x4 transpose of 16-bit lanes for the rows held in v30, v12, v31, v13.
// First interleave halfwords pairwise ...
1547    trn1        v24.4h, v30.4h, v12.4h
1548    trn2        v25.4h, v30.4h, v12.4h
1549    trn1        v26.4h, v31.4h, v13.4h
1550    trn2        v27.4h, v31.4h, v13.4h
1551
// ... then interleave 32-bit pairs. Afterwards v30 holds lane 0 of each row,
// v12 lane 1, v31 lane 2 and v13 lane 3.
1552    trn1        v30.2s, v24.2s, v26.2s
1553    trn2        v31.2s, v24.2s, v26.2s
1554    trn1        v12.2s, v25.2s, v27.2s
1555    trn2        v13.2s, v25.2s, v27.2s
1556
// Same transpose for the rows held in v14, v18, v15, v19.
1557    trn1        v24.4h, v14.4h, v18.4h
1558    trn2        v25.4h, v14.4h, v18.4h
1559    trn1        v26.4h, v15.4h, v19.4h
1560    trn2        v27.4h, v15.4h, v19.4h
1561
// Afterwards v14 holds lane 0 of each row, v18 lane 1, v15 lane 2, v19 lane 3.
1562    trn1        v14.2s, v24.2s, v26.2s
1563    trn2        v15.2s, v24.2s, v26.2s
1564    trn1        v18.2s, v25.2s, v27.2s
1565    trn2        v19.2s, v25.2s, v27.2s
1566
// Put back the low halves of v24-v27 that were parked above.
1567    mov         v24.d[0],x15
1568    mov         v25.d[0],x16
1569    mov         v26.d[0],x19
1570    mov         v27.d[0],x20
1571
// Four back-to-back 16-byte stores (64 bytes in total) through x1.
1572    st1         { v30.4h, v31.4h},[x1],#16
1573    st1         { v12.4h, v13.4h},[x1],#16
1574    st1         { v14.4h, v15.4h},[x1],#16
1575    st1         { v18.4h, v19.4h},[x1],#16
1576
// Skip a further 96 bytes, so x1 finishes 160 bytes past its value on entry
// to this section.
1577    add         x1,x1,#96
1578
// Count down x14 and repeat from dct_stage1 (label defined before this
// section) until it reaches zero, then fall through into second_stage_dct.
1579    subs        x14,x14,#1
1580    bne         dct_stage1
// second_stage_dct: one-time setup for the second transform stage, reached by
// falling out of the stage-1 loop. Restores four values from the stack, loads
// the early-exit thresholds and the addressing constants used by stage2, then
// jumps straight to stage2 (bypassing the pointer bump at dct_stage2).
1581second_stage_dct:
1582//    mov        x0,x1
// Pop two register pairs and release 32 bytes of stack. The matching pushes
// are not visible in this section. Per the register notes that follow this
// block, x8/x7 are the prediction/destination strides and x0/x1 the scratch
// and temp buffers.
1583    ldp         x8, x7,[sp],#16
1584    ldp         x0, x1,[sp],#16
1585
1586//    add x4,x2,x8, lsl #1    @ x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
1587//    add x5,x8,x8, lsl #1    @
1588//    sub   x0,x0,#512
// x11, x5, x6 and x9 are the values that stage2 compares x12 against
// (cmp x12,<reg> followed by bhs) to leave a pass early.
1589    mov         x11,#0xfffffff0
1590    mov         x5, #0xffffff00
// 32-bit loads from the literals x5_addr and x9_addr, which are defined
// outside this section. Writing a w register clears the upper 32 bits.
1591    ldr         w6, x5_addr
1592    ldr         w9, x9_addr
1593//    sub         x1,x1,#2048
// x4 keeps the starting value of x1. dct_stage2 advances it by 32 bytes and
// stage2 reloads x1 from it.
1594    mov         x4,x1
// x10 is the register post-index used after each 16-byte post-indexed load in
// stage2, so every load pair moves x1 by 16 + 240 = 256 bytes.
1595    mov         x10,#240
// x14 is listed as the loop counter in the register notes. Its decrement is
// not visible in this section - presumably 8 stage-2 passes, TODO confirm.
1596    mov         x14,#8
1597    b           stage2
1598
1599// registers free :
1600
1601// arm registers used
1602// x8 : prediction stride
1603// x7 : destination stride
1604// x1: temp buffer
1605// x2 : pred buffer
1606// x3 : destination buffer
1607// x14 : loop counter
1608//x0 : scratch buffer
1609//x10 : used as stride
1610// x4 : used to store the initial address
1611//x12 : zero cols
1612// x11 : 0xfffffff0
1613// x5 : 0xffffff00
// dct_stage2: re-entry point for the stage-2 loop. Moves the saved base x4
// forward by 32 bytes and restarts the read pointer x1 from it. The first
// pass enters at stage2 directly (b stage2 in second_stage_dct), so it reads
// from the unmodified base.
1614dct_stage2:
1615    add         x4,x4,#32
1616    mov         x1,x4
1617stage2:
// Load four 4x16-bit vectors. x1 advances by 16 bytes after the first load
// and by x10 after the second (x10 is set to 240 in second_stage_dct).
1618    ld1         {v10.4h, v11.4h},[x1],#16
1619    ld1         {v8.4h, v9.4h},[x1],x10
1620
// Start the four 32-bit accumulators that the stage2_shift1 comments call
// b0..b3 (v24, v26, v28, v30). The multipliers are lanes of v0-v7, which are
// loaded outside this section.
1621    smull       v24.4s, v8.4h, v0.h[1]     //// b0  = v8 * v0.h[1]
1622    smull       v26.4s, v8.4h, v0.h[3]     //// b1  = v8 * v0.h[3]
1623    smull       v28.4s, v8.4h, v1.h[1]     //// b2  = v8 * v1.h[1]
1624    smull       v30.4s, v8.4h, v1.h[3]     //// b3  = v8 * v1.h[3]
1625
1626    smlal       v24.4s, v9.4h, v0.h[3]     //// b0 += v9 * v0.h[3]
1627    smlal       v26.4s, v9.4h, v2.h[1]     //// b1 += v9 * v2.h[1]
1628    smlal       v28.4s, v9.4h, v3.h[3]     //// b2 += v9 * v3.h[3]
1629    smlal       v30.4s, v9.4h, v5.h[1]     //// b3 += v9 * v5.h[1]
1630
1631
1632
// Start the four accumulators that the stage2_shift1 comments call a0..a3
// (v20, v22, v16, v18). Each begins with v10 * v0.h[0] and then adds v11
// times a different lane.
1633    smull       v20.4s, v10.4h, v0.h[0]
1634    smlal       v20.4s, v11.4h, v0.h[2]
1635
1636
1637    smull       v22.4s, v10.4h, v0.h[0]
1638    smlal       v22.4s, v11.4h, v1.h[2]
1639
1640    smull       v16.4s, v10.4h, v0.h[0]
1641    smlal       v16.4s, v11.4h, v2.h[2]
1642
1643    smull       v18.4s, v10.4h, v0.h[0]
1644    smlal       v18.4s, v11.4h, v3.h[2]
// Early exit: when x12 >= x11 (unsigned compare) the remaining loads and
// multiply-accumulates are skipped. The register notes describe x12 as
// "zero cols"; its encoding is set up outside this section.
1645    cmp         x12,x11
1646    bhs         stage2_shift1
1647
1648    ld1         {v12.4h, v13.4h},[x1],#16
1649    ld1         {v14.4h, v15.4h},[x1],x10
1650
1651
1652
1653
1654
1655
1656    smlal       v24.4s, v14.4h, v1.h[1]
1657    smlal       v26.4s, v14.4h, v3.h[3]
1658    smlal       v28.4s, v14.4h, v6.h[1]
1659    smlsl       v30.4s, v14.4h, v7.h[1]
1660
1661
1662    smlal       v24.4s, v15.4h, v1.h[3]
1663    smlal       v26.4s, v15.4h, v5.h[1]
1664    smlsl       v28.4s, v15.4h, v7.h[1]
1665    smlsl       v30.4s, v15.4h, v3.h[3]
1666
1667
1668    smlal       v20.4s, v12.4h, v1.h[0]
1669    smlal       v20.4s, v13.4h, v1.h[2]
1670    smlal       v22.4s, v12.4h, v3.h[0]
1671    smlal       v22.4s, v13.4h, v4.h[2]
1672    smlal       v16.4s, v12.4h, v5.h[0]
1673    smlal       v16.4s, v13.4h, v7.h[2]
1674    smlal       v18.4s, v12.4h, v7.h[0]
1675    smlsl       v18.4s, v13.4h, v5.h[2]
1676    cmp         x12,x5
1677    bhs         stage2_shift1
1678
1679    ld1         {v10.4h, v11.4h},[x1],#16
1680    ld1         {v8.4h, v9.4h},[x1],x10
1681
1682    smlal       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
1683    smlal       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
1684    smlsl       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
1685    smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
1686
1687    smlal       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1688    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1689    smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1690    smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1691
1692
1693
1694
1695
1696    smlal       v20.4s, v10.4h, v2.h[0]
1697    smlal       v20.4s, v11.4h, v2.h[2]
1698
1699
1700    smlal       v22.4s, v10.4h, v6.h[0]
1701    smlal       v22.4s, v11.4h, v7.h[2]
1702
1703    smlsl       v16.4s, v10.4h, v6.h[0]
1704    smlsl       v16.4s, v11.4h, v3.h[2]
1705
1706    smlsl       v18.4s, v10.4h, v2.h[0]
1707    smlsl       v18.4s, v11.4h, v1.h[2]
1708
1709    cmp         x12,x6
1710    bhs         stage2_shift1
1711
1712
1713    ld1         {v12.4h, v13.4h},[x1],#16
1714    ld1         {v14.4h, v15.4h},[x1],x10
1715
1716
1717
1718
1719
1720    smlal       v24.4s, v14.4h, v3.h[1]
1721    smlsl       v26.4s, v14.4h, v6.h[1]
1722    smlsl       v28.4s, v14.4h, v0.h[1]
1723    smlsl       v30.4s, v14.4h, v6.h[3]
1724
1725
1726    smlal       v24.4s, v15.4h, v3.h[3]
1727    smlsl       v26.4s, v15.4h, v4.h[3]
1728    smlsl       v28.4s, v15.4h, v2.h[3]
1729    smlal       v30.4s, v15.4h, v5.h[3]
1730
1731
1732    smlal       v20.4s, v12.4h, v3.h[0]
1733    smlal       v20.4s, v13.4h, v3.h[2]
1734    smlsl       v22.4s, v12.4h, v7.h[0]
1735    smlsl       v22.4s, v13.4h, v5.h[2]
1736    smlsl       v16.4s, v12.4h, v1.h[0]
1737    smlsl       v16.4s, v13.4h, v1.h[2]
1738    smlsl       v18.4s, v12.4h, v5.h[0]
1739    smlal       v18.4s, v13.4h, v7.h[2]
1740
1741    cmp         x12,x9
1742    bhs         stage2_shift1
1743
1744
1745    ld1         {v10.4h, v11.4h},[x1],#16
1746    ld1         {v8.4h, v9.4h},[x1],x10
1747
1748
1749    smlal       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
1750    smlsl       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
1751    smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
1752    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
1753
1754    smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1755    smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1756    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
1757    smlal       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1758
1759
1760
1761
1762
1763    smlal       v20.4s, v10.4h, v0.h[0]
1764    smlal       v20.4s, v11.4h, v4.h[2]
1765
1766
1767    smlsl       v22.4s, v10.4h, v0.h[0]
1768    smlsl       v22.4s, v11.4h, v2.h[2]
1769
1770    smlsl       v16.4s, v10.4h, v0.h[0]
1771    smlsl       v16.4s, v11.4h, v6.h[2]
1772
1773    smlal       v18.4s, v10.4h, v0.h[0]
1774    smlal       v18.4s, v11.4h, v0.h[2]
1775
1776    ld1         {v12.4h, v13.4h},[x1],#16
1777    ld1         {v14.4h, v15.4h},[x1],x10
1778
1779
1780
1781
1782
1783    smlal       v24.4s, v14.4h, v5.h[1]
1784    smlsl       v26.4s, v14.4h, v0.h[2]
1785    smlal       v28.4s, v14.4h, v5.h[3]
1786    smlal       v30.4s, v14.4h, v4.h[3]
1787
1788
1789    smlal       v24.4s, v15.4h, v5.h[3]
1790    smlsl       v26.4s, v15.4h, v1.h[1]
1791    smlal       v28.4s, v15.4h, v3.h[1]
1792    smlsl       v30.4s, v15.4h, v7.h[3]
1793
1794
1795    smlal       v20.4s, v12.4h, v5.h[0]
1796    smlal       v20.4s, v13.4h, v5.h[2]
1797    smlsl       v22.4s, v12.4h, v1.h[0]
1798    smlsl       v22.4s, v13.4h, v0.h[2]
1799    smlal       v16.4s, v12.4h, v7.h[0]
1800    smlal       v16.4s, v13.4h, v4.h[2]
1801    smlal       v18.4s, v12.4h, v3.h[0]
1802    smlal       v18.4s, v13.4h, v6.h[2]
1803
1804
1805    ld1         {v10.4h, v11.4h},[x1],#16
1806    ld1         {v8.4h, v9.4h},[x1],x10
1807
1808
1809
1810
1811    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
1812    smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
1813    smlal       v28.4s, v8.4h, v0.h[1]     //// y1 * sin3(part of b2)
1814    smlsl       v30.4s, v8.4h, v4.h[1]     //// y1 * sin1(part of b3)
1815
1816    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1817    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
1818    smlal       v28.4s, v9.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
1819    smlsl       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
1820
1821
1822
1823
1824
1825    smlal       v20.4s, v10.4h, v6.h[0]
1826    smlal       v20.4s, v11.4h, v6.h[2]
1827
1828
1829    smlsl       v22.4s, v10.4h, v2.h[0]
1830    smlsl       v22.4s, v11.4h, v3.h[2]
1831
1832    smlal       v16.4s, v10.4h, v2.h[0]
1833    smlal       v16.4s, v11.4h, v0.h[2]
1834
1835    smlsl       v18.4s, v10.4h, v6.h[0]
1836    smlsl       v18.4s, v11.4h, v2.h[2]
1837
1838    ld1         {v12.4h, v13.4h},[x1],#16
1839    ld1         {v14.4h, v15.4h},[x1],x10
1840
1841    smlal       v24.4s, v14.4h, v7.h[1]
1842    smlsl       v26.4s, v14.4h, v5.h[3]
1843    smlal       v28.4s, v14.4h, v4.h[1]
1844    smlsl       v30.4s, v14.4h, v2.h[3]
1845
1846
1847    smlal       v24.4s, v15.4h, v7.h[3]
1848    smlsl       v26.4s, v15.4h, v7.h[1]
1849    smlal       v28.4s, v15.4h, v6.h[3]
1850    smlsl       v30.4s, v15.4h, v6.h[1]
1851
1852
1853    smlal       v20.4s, v12.4h, v7.h[0]
1854    smlal       v20.4s, v13.4h, v7.h[2]
1855    smlsl       v22.4s, v12.4h, v5.h[0]
1856    smlsl       v22.4s, v13.4h, v6.h[2]
1857    smlal       v16.4s, v12.4h, v3.h[0]
1858    smlal       v16.4s, v13.4h, v5.h[2]
1859    smlsl       v18.4s, v12.4h, v1.h[0]
1860    smlsl       v18.4s, v13.4h, v4.h[2]
1861
// stage2_shift1: branch target of the stage-2 early exits above and
// fall-through end of the first half of a stage-2 pass. Going by the comments
// on the narrowing shifts below, v20/v22/v16/v18 carry a0..a3 and
// v24/v26/v28/v30 carry b0..b3 as 32-bit sums. This section forms a+b and
// a-b, narrows to 16 bits, transposes two 4x4 tiles and stores them via x0.
1862stage2_shift1:
// v8 = a0 + b0, v10 = a0 - b0
1863    add         v8.4s,  v20.4s ,  v24.4s
1864    sub         v10.4s,  v20.4s ,  v24.4s
1865
// v12 = a1 + b1, v24 is reused for a1 - b1
1866    add         v12.4s,  v22.4s ,  v26.4s
1867    sub         v24.4s,  v22.4s ,  v26.4s
1868
// v14 = a2 + b2, v26 is reused for a2 - b2
1869    add         v14.4s,  v16.4s ,  v28.4s
1870    sub         v26.4s,  v16.4s ,  v28.4s
1871
1872
// v16 = a3 + b3, v28 is reused for a3 - b3
1873    add         v16.4s,  v18.4s ,  v30.4s
1874    sub         v28.4s,  v18.4s ,  v30.4s
1875
1876
// Rounding, saturating narrow of every 32-bit lane to 16 bits.
// shift_stage2_idct is a symbol defined outside this section, so its numeric
// value is not restated here.
1877    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> shift_stage2_idct
1878    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> shift_stage2_idct
1879    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> shift_stage2_idct
1880    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> shift_stage2_idct
1881    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> shift_stage2_idct
1882    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> shift_stage2_idct
1883    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> shift_stage2_idct
1884    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> shift_stage2_idct
1885
1886
// Park the low 64 bits of v24-v27 in general registers: the transposes below
// use v24-v27 as scratch and the values are put back afterwards.
// NOTE(review): x19 and x20 are callee-saved under AAPCS64 - confirm that the
// prologue (not visible in this section) preserves them.
1887    umov        x15,v24.d[0]
1888    umov        x16,v25.d[0]
1889    umov        x19,v26.d[0]
1890    umov        x20,v27.d[0]
1891
// 4x4 transpose of 16-bit lanes for the rows held in v30, v12, v31, v13.
// First interleave halfwords pairwise ...
1892    trn1        v24.4h, v30.4h, v12.4h
1893    trn2        v25.4h, v30.4h, v12.4h
1894    trn1        v26.4h, v31.4h, v13.4h
1895    trn2        v27.4h, v31.4h, v13.4h
1896
// ... then interleave 32-bit pairs. Afterwards v30 holds lane 0 of each row,
// v12 lane 1, v31 lane 2 and v13 lane 3.
1897    trn1        v30.2s, v24.2s, v26.2s
1898    trn2        v31.2s, v24.2s, v26.2s
1899    trn1        v12.2s, v25.2s, v27.2s
1900    trn2        v13.2s, v25.2s, v27.2s
1901
// Same transpose for the rows held in v14, v18, v15, v19.
1902    trn1        v24.4h, v14.4h, v18.4h
1903    trn2        v25.4h, v14.4h, v18.4h
1904    trn1        v26.4h, v15.4h, v19.4h
1905    trn2        v27.4h, v15.4h, v19.4h
1906
// Afterwards v14 holds lane 0 of each row, v18 lane 1, v15 lane 2, v19 lane 3.
1907    trn1        v14.2s, v24.2s, v26.2s
1908    trn2        v15.2s, v24.2s, v26.2s
1909    trn1        v18.2s, v25.2s, v27.2s
1910    trn2        v19.2s, v25.2s, v27.2s
1911
// Put back the low halves of v24-v27 that were parked above.
1912    mov         v24.d[0],x15
1913    mov         v25.d[0],x16
1914    mov         v26.d[0],x19
1915    mov         v27.d[0],x20
1916
// Four back-to-back 16-byte stores through x0, which therefore advances by
// 64 bytes. Unlike the stage-1 stores this writes via x0, not x1.
1917    st1         { v30.4h, v31.4h},[x0],#16
1918    st1         { v12.4h, v13.4h},[x0],#16
1919    st1         { v14.4h, v15.4h},[x0],#16
1920    st1         { v18.4h, v19.4h},[x0],#16
1921
1922    mov         x1,x4
1923
1924
1925
1926
1927
1928
1929    ld1         {v10.4h, v11.4h},[x1],#16
1930    ld1         {v8.4h, v9.4h},[x1],x10
1931
1932
1933    smull       v24.4s, v8.4h, v2.h[1]     //// y1 * cos1(part of b0)
1934    smull       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
1935    smull       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
1936    smull       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
1937
1938    smlal       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
1939    smlsl       v26.4s, v9.4h, v7.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
1940    smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
1941    smlsl       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
1942
1943
1944
1945
1946
1947    smull       v20.4s, v10.4h, v0.h[0]
1948    smlal       v20.4s, v11.4h, v4.h[2]
1949
1950
1951    smull       v22.4s, v10.4h, v0.h[0]
1952    smlal       v22.4s, v11.4h, v5.h[2]
1953
1954    smull       v16.4s, v10.4h, v0.h[0]
1955    smlal       v16.4s, v11.4h, v6.h[2]
1956
1957    smull       v18.4s, v10.4h, v0.h[0]
1958    smlal       v18.4s, v11.4h, v7.h[2]
1959
1960    cmp         x12,x11
1961    bhs         stage2_shift2
1962
1963    ld1         {v12.4h, v13.4h},[x1],#16
1964    ld1         {v14.4h, v15.4h},[x1],x10
1965
1966
1967    smlsl       v24.4s, v14.4h, v4.h[3]
1968    smlsl       v26.4s, v14.4h, v2.h[1]
1969    smlsl       v28.4s, v14.4h, v0.h[1]
1970    smlsl       v30.4s, v14.4h, v2.h[3]
1971
1972
1973    smlsl       v24.4s, v15.4h, v0.h[3]
1974    smlsl       v26.4s, v15.4h, v3.h[1]
1975    smlsl       v28.4s, v15.4h, v6.h[3]
1976    smlal       v30.4s, v15.4h, v5.h[3]
1977
1978
1979    smlsl       v20.4s, v12.4h, v7.h[0]
1980    smlsl       v20.4s, v13.4h, v2.h[2]
1981    smlsl       v22.4s, v12.4h, v5.h[0]
1982    smlsl       v22.4s, v13.4h, v0.h[2]
1983    smlsl       v16.4s, v12.4h, v3.h[0]
1984    smlsl       v16.4s, v13.4h, v3.h[2]
1985    smlsl       v18.4s, v12.4h, v1.h[0]
1986    smlsl       v18.4s, v13.4h, v6.h[2]
1987
1988    cmp         x12,x5
1989    bhs         stage2_shift2
1990
1991    ld1         {v10.4h, v11.4h},[x1],#16
1992    ld1         {v8.4h, v9.4h},[x1],x10
1993
1994
1995
1996
1997
1998    smlsl       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
1999    smlal       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
2000    smlal       v28.4s, v8.4h, v2.h[3]     //// y1 * sin3(part of b2)
2001    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
2002
2003    smlal       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2004    smlal       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2005    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2006    smlsl       v30.4s, v9.4h, v6.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
2007
2008
2009
2010
2011
2012    smlsl       v20.4s, v10.4h, v2.h[0]
2013    smlsl       v20.4s, v11.4h, v6.h[2]
2014
2015
2016    smlsl       v22.4s, v10.4h, v6.h[0]
2017    smlal       v22.4s, v11.4h, v4.h[2]
2018
2019    smlal       v16.4s, v10.4h, v6.h[0]
2020    smlal       v16.4s, v11.4h, v0.h[2]
2021
2022    smlal       v18.4s, v10.4h, v2.h[0]
2023    smlal       v18.4s, v11.4h, v5.h[2]
2024
2025    cmp         x12,x6
2026    bhs         stage2_shift2
2027
2028
2029    ld1         {v12.4h, v13.4h},[x1],#16
2030    ld1         {v14.4h, v15.4h},[x1],x10
2031
2032
2033
2034
2035
2036
2037    smlal       v24.4s, v14.4h, v2.h[3]
2038    smlal       v26.4s, v14.4h, v3.h[3]
2039    smlsl       v28.4s, v14.4h, v5.h[3]
2040    smlsl       v30.4s, v14.4h, v0.h[3]
2041
2042
2043    smlal       v24.4s, v15.4h, v1.h[3]
2044    smlsl       v26.4s, v15.4h, v6.h[3]
2045    smlsl       v28.4s, v15.4h, v0.h[3]
2046    smlal       v30.4s, v15.4h, v7.h[3]
2047
2048
2049    smlal       v20.4s, v12.4h, v5.h[0]
2050    smlal       v20.4s, v13.4h, v0.h[2]
2051    smlal       v22.4s, v12.4h, v1.h[0]
2052    smlal       v22.4s, v13.4h, v6.h[2]
2053    smlal       v16.4s, v12.4h, v7.h[0]
2054    smlsl       v16.4s, v13.4h, v2.h[2]
2055    smlsl       v18.4s, v12.4h, v3.h[0]
2056    smlsl       v18.4s, v13.4h, v4.h[2]
2057
2058    cmp         x12,x9
2059    bhs         stage2_shift2
2060
2061
2062    ld1         {v10.4h, v11.4h},[x1],#16
2063    ld1         {v8.4h, v9.4h},[x1],x10
2064
2065
2066
2067    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
2068    smlsl       v26.4s, v8.4h, v1.h[1]     //// y1 * cos3(part of b1)
2069    smlsl       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
2070    smlal       v30.4s, v8.4h, v0.h[3]     //// y1 * sin1(part of b3)
2071
2072    smlsl       v24.4s, v9.4h, v5.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2073    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
2074    smlal       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
2075    smlal       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2076
2077
2078
2079
2080
2081    smlal       v20.4s, v10.4h, v0.h[0]
2082    smlsl       v20.4s, v11.4h, v7.h[2]
2083
2084
2085    smlsl       v22.4s, v10.4h, v0.h[0]
2086    smlsl       v22.4s, v11.4h, v1.h[2]
2087
2088    smlsl       v16.4s, v10.4h, v0.h[0]
2089    smlal       v16.4s, v11.4h, v5.h[2]
2090
2091    smlal       v18.4s, v10.4h, v0.h[0]
2092    smlal       v18.4s, v11.4h, v3.h[2]
2093
2094    ld1         {v12.4h, v13.4h},[x1],#16
2095    ld1         {v14.4h, v15.4h},[x1],x10
2096
2097
2098
2099
2100    smlsl       v24.4s, v14.4h, v0.h[1]
2101    smlal       v26.4s, v14.4h, v6.h[1]
2102    smlal       v28.4s, v14.4h, v4.h[1]
2103    smlsl       v30.4s, v14.4h, v1.h[1]
2104
2105
2106    smlsl       v24.4s, v15.4h, v3.h[3]
2107    smlal       v26.4s, v15.4h, v0.h[1]
2108    smlsl       v28.4s, v15.4h, v5.h[1]
2109    smlsl       v30.4s, v15.4h, v6.h[1]
2110
2111
2112    smlsl       v20.4s, v12.4h, v3.h[0]
2113    smlsl       v20.4s, v13.4h, v1.h[2]
2114    smlsl       v22.4s, v12.4h, v7.h[0]
2115    smlal       v22.4s, v13.4h, v3.h[2]
2116    smlal       v16.4s, v12.4h, v1.h[0]
2117    smlal       v16.4s, v13.4h, v7.h[2]
2118    smlsl       v18.4s, v12.4h, v5.h[0]
2119    smlsl       v18.4s, v13.4h, v2.h[2]
2120
2121
2122    ld1         {v10.4h, v11.4h},[x1],#16
2123    ld1         {v8.4h, v9.4h},[x1],x10
2124
2125
2126    smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
2127    smlal       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
2128    smlsl       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
2129    smlal       v30.4s, v8.4h, v2.h[1]     //// y1 * sin1(part of b3)
2130
2131    smlal       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2132    smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2133    smlsl       v28.4s, v9.4h, v7.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2134    smlal       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2135
2136
2137
2138
2139
2140    smlsl       v20.4s, v10.4h, v6.h[0]
2141    smlal       v20.4s, v11.4h, v5.h[2]
2142
2143
2144    smlal       v22.4s, v10.4h, v2.h[0]
2145    smlal       v22.4s, v11.4h, v7.h[2]
2146
2147    smlsl       v16.4s, v10.4h, v2.h[0]
2148    smlsl       v16.4s, v11.4h, v4.h[2]
2149
2150    smlal       v18.4s, v10.4h, v6.h[0]
2151    smlal       v18.4s, v11.4h, v1.h[2]
2152
2153
2154    ld1         {v12.4h, v13.4h},[x1],#16
2155    ld1         {v14.4h, v15.4h},[x1],x10
2156
2157
2158
2159    smlal       v24.4s, v14.4h, v1.h[1]
2160    smlsl       v26.4s, v14.4h, v0.h[3]
2161    smlal       v28.4s, v14.4h, v1.h[3]
2162    smlsl       v30.4s, v14.4h, v3.h[1]
2163
2164
2165    smlal       v24.4s, v15.4h, v5.h[3]
2166    smlsl       v26.4s, v15.4h, v5.h[1]
2167    smlal       v28.4s, v15.4h, v4.h[3]
2168    smlsl       v30.4s, v15.4h, v4.h[1]
2169
2170
2171    smlal       v20.4s, v12.4h, v1.h[0]
2172    smlal       v20.4s, v13.4h, v3.h[2]
2173    smlsl       v22.4s, v12.4h, v3.h[0]
2174    smlsl       v22.4s, v13.4h, v2.h[2]
2175    smlal       v16.4s, v12.4h, v5.h[0]
2176    smlal       v16.4s, v13.4h, v1.h[2]
2177    smlsl       v18.4s, v12.4h, v7.h[0]
2178    smlsl       v18.4s, v13.4h, v0.h[2]
2179
// stage2_shift2: finish one group of stage-2 results and write it out.
//   1. Butterfly: v20+v24, v22+v26, v16+v28, v18+v30 and the matching
//      differences (named a0..a3 / b0..b3 in the trailing comments).
//   2. sqrshrn: signed saturating rounding shift right by shift_stage2_idct,
//      narrowing every 32-bit lane to 16 bits.
//   3. Two 4x4 halfword transposes built from trn1/trn2.
//   4. Four 16-byte stores through x0 (x0 advances by 64 bytes in total).
// The x0..x7 names in the trailing comments are local labels for the eight
// result vectors of this group, not the general-purpose registers.
// NOTE(review): the trailing comments say ">> 7"; the actual shift amount is
// whatever shift_stage2_idct is defined as outside this view - confirm.
2180stage2_shift2:
2181    add         v8.4s,  v20.4s ,  v24.4s
2182    sub         v10.4s,  v20.4s ,  v24.4s
2183
2184    add         v12.4s,  v22.4s ,  v26.4s
2185    sub         v24.4s,  v22.4s ,  v26.4s
2186
2187    add         v14.4s,  v16.4s ,  v28.4s
2188    sub         v26.4s,  v16.4s ,  v28.4s
2189
2190
2191    add         v16.4s,  v18.4s ,  v30.4s
2192    sub         v28.4s,  v18.4s ,  v30.4s
2193
2194
2195    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2196    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2197    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2198    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2199    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2200    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2201    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2202    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2203
    // v24-v27 serve as transpose scratch below, so their low 64 bits are
    // parked in general-purpose registers and put back afterwards.
    // NOTE(review): x19/x20 are callee-saved under AAPCS64 - presumably the
    // prologue (outside this view) preserves them; confirm.
2204    umov        x15,v24.d[0]
2205    umov        x16,v25.d[0]
2206    umov        x19,v26.d[0]
2207    umov        x20,v27.d[0]
2208
    // 4x4 transpose of rows (v30, v12, v31, v13) = (x0, x1, x2, x3):
    // first interleave halfwords pairwise into the scratch registers ...
2209    trn1        v24.4h, v30.4h, v12.4h
2210    trn2        v25.4h, v30.4h, v12.4h
2211    trn1        v26.4h, v31.4h, v13.4h
2212    trn2        v27.4h, v31.4h, v13.4h
2213
    // ... then interleave 32-bit pairs. Result: v30 = column 0,
    // v31 = column 2, v12 = column 1, v13 = column 3.
2214    trn1        v30.2s, v24.2s, v26.2s
2215    trn2        v31.2s, v24.2s, v26.2s
2216    trn1        v12.2s, v25.2s, v27.2s
2217    trn2        v13.2s, v25.2s, v27.2s
2218
    // Same transpose for rows (v14, v18, v15, v19) = (x4, x5, x6, x7).
2219    trn1        v24.4h, v14.4h, v18.4h
2220    trn2        v25.4h, v14.4h, v18.4h
2221    trn1        v26.4h, v15.4h, v19.4h
2222    trn2        v27.4h, v15.4h, v19.4h
2223
    // Result: v14 = column 0, v15 = column 2, v18 = column 1, v19 = column 3.
2224    trn1        v14.2s, v24.2s, v26.2s
2225    trn2        v15.2s, v24.2s, v26.2s
2226    trn1        v18.2s, v25.2s, v27.2s
2227    trn2        v19.2s, v25.2s, v27.2s
2228
    // Put back the low 64 bits of v24-v27 saved before the transposes.
2229    mov         v24.d[0],x15
2230    mov         v25.d[0],x16
2231    mov         v26.d[0],x19
2232    mov         v27.d[0],x20
2233
    // Store order per 4x4 tile is column 0, 2, 1, 3 (16 bytes per st1).
2234    st1         { v30.4h, v31.4h},[x0],#16
2235    st1         { v12.4h, v13.4h},[x0],#16
2236    st1         { v14.4h, v15.4h},[x0],#16
2237    st1         { v18.4h, v19.4h},[x0],#16
2238
2239
// Accumulation pass for the group that is finished at stage2_shift3.
//   x1              : read pointer, restarted from x4; every load pair
//                     advances it by 16 bytes and then by x10.
//   v0-v7 (.h lanes): 16-bit multipliers; they are not written in this pass.
//   v24/v26/v28/v30 : 32-bit accumulators fed from v8/v9 and v14/v15
//                     (the b0..b3 terms at stage2_shift3).
//   v20/v22/v16/v18 : 32-bit accumulators fed from v10/v11 and v12/v13
//                     (the a0..a3 terms at stage2_shift3).
// Each "cmp x12,<reg> / bhs stage2_shift3" leaves early when x12 >= <reg>
// (unsigned), skipping the remaining loads - presumably because the rest of
// the input is known to be zero; confirm against where x12 is set up.
// NOTE(review): the "y1 * cos1 ..." trailing comments are repeated verbatim
// on different multipliers and on both smlal and smlsl, so they do not
// describe the individual instructions; only the "(part of bN)" suffix
// matches the accumulator being updated.
2240    mov         x1,x4
2241
2242
2243
2244
2245    ld1         {v10.4h, v11.4h},[x1],#16
2246    ld1         {v8.4h, v9.4h},[x1],x10
2247
    // smull starts each accumulator afresh; everything after it accumulates.
2248    smull       v24.4s, v8.4h, v4.h[1]     //// y1 * cos1(part of b0)
2249    smull       v26.4s, v8.4h, v4.h[3]     //// y1 * cos3(part of b1)
2250    smull       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
2251    smull       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
2252
2253    smlsl       v24.4s, v9.4h, v3.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2254    smlsl       v26.4s, v9.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2255    smlsl       v28.4s, v9.4h, v0.h[2]     //// y1 * sin3 - y3 * cos1(part of b2)
2256    smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2257
2258
2259
2260
2261
    // All four of these accumulators start from the same v10 * v0.h[0] term.
2262    smull       v20.4s, v10.4h, v0.h[0]
2263    smlsl       v20.4s, v11.4h, v7.h[2]
2264
2265
2266    smull       v22.4s, v10.4h, v0.h[0]
2267    smlsl       v22.4s, v11.4h, v6.h[2]
2268
2269    smull       v16.4s, v10.4h, v0.h[0]
2270    smlsl       v16.4s, v11.4h, v5.h[2]
2271
2272    smull       v18.4s, v10.4h, v0.h[0]
2273    smlsl       v18.4s, v11.4h, v4.h[2]
2274
    // Early-out check 1 (x12 against x11).
2275    cmp         x12,x11
2276    bhs         stage2_shift3
2277
2278    ld1         {v12.4h, v13.4h},[x1],#16
2279    ld1         {v14.4h, v15.4h},[x1],x10
2280
2281    smlsl       v24.4s, v14.4h, v5.h[1]
2282    smlsl       v26.4s, v14.4h, v7.h[3]
2283    smlal       v28.4s, v14.4h, v5.h[3]
2284    smlal       v30.4s, v14.4h, v3.h[1]
2285
2286
2287    smlal       v24.4s, v15.4h, v2.h[1]
2288    smlal       v26.4s, v15.4h, v1.h[1]
2289    smlal       v28.4s, v15.4h, v4.h[3]
2290    smlsl       v30.4s, v15.4h, v7.h[3]
2291
2292
2293    smlsl       v20.4s, v12.4h, v1.h[0]
2294    smlal       v20.4s, v13.4h, v6.h[2]
2295    smlsl       v22.4s, v12.4h, v3.h[0]
2296    smlal       v22.4s, v13.4h, v3.h[2]
2297    smlsl       v16.4s, v12.4h, v5.h[0]
2298    smlal       v16.4s, v13.4h, v0.h[2]
2299    smlsl       v18.4s, v12.4h, v7.h[0]
2300    smlal       v18.4s, v13.4h, v2.h[2]
2301
    // Early-out check 2 (x12 against x5).
2302    cmp         x12,x5
2303    bhs         stage2_shift3
2304
2305    ld1         {v10.4h, v11.4h},[x1],#16
2306    ld1         {v8.4h, v9.4h},[x1],x10
2307
2308
2309
2310    smlal       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
2311    smlsl       v26.4s, v8.4h, v5.h[1]     //// y1 * cos3(part of b1)
2312    smlsl       v28.4s, v8.4h, v0.h[3]     //// y1 * sin3(part of b2)
2313    smlsl       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
2314
2315    smlsl       v24.4s, v9.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2316    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
2317    smlal       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
2318    smlal       v30.4s, v9.4h, v0.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2319
2320
2321
2322
2323
2324    smlal       v20.4s, v10.4h, v2.h[0]
2325    smlsl       v20.4s, v11.4h, v5.h[2]
2326
2327
2328    smlal       v22.4s, v10.4h, v6.h[0]
2329    smlsl       v22.4s, v11.4h, v0.h[2]
2330
2331    smlsl       v16.4s, v10.4h, v6.h[0]
2332    smlsl       v16.4s, v11.4h, v4.h[2]
2333
2334    smlsl       v18.4s, v10.4h, v2.h[0]
2335    smlal       v18.4s, v11.4h, v6.h[2]
2336
    // Early-out check 3 (x12 against x6).
2337    cmp         x12,x6
2338    bhs         stage2_shift3
2339
2340    ld1         {v12.4h, v13.4h},[x1],#16
2341    ld1         {v14.4h, v15.4h},[x1],x10
2342
2343
2344
2345
2346
2347    smlsl       v24.4s, v14.4h, v7.h[1]
2348    smlal       v26.4s, v14.4h, v2.h[1]
2349    smlal       v28.4s, v14.4h, v4.h[1]
2350    smlsl       v30.4s, v14.4h, v5.h[1]
2351
2352
2353    smlal       v24.4s, v15.4h, v0.h[3]
2354    smlal       v26.4s, v15.4h, v7.h[1]
2355    smlsl       v28.4s, v15.4h, v1.h[1]
2356    smlsl       v30.4s, v15.4h, v6.h[1]
2357
2358
2359    smlsl       v20.4s, v12.4h, v3.h[0]
2360    smlal       v20.4s, v13.4h, v4.h[2]
2361    smlal       v22.4s, v12.4h, v7.h[0]
2362    smlal       v22.4s, v13.4h, v2.h[2]
2363    smlal       v16.4s, v12.4h, v1.h[0]
2364    smlsl       v16.4s, v13.4h, v6.h[2]
2365    smlal       v18.4s, v12.4h, v5.h[0]
2366    smlsl       v18.4s, v13.4h, v0.h[2]
2367
    // Early-out check 4 (x12 against x9). Past this point there are no
    // further checks: the remaining four load pairs always run.
2368    cmp         x12,x9
2369    bhs         stage2_shift3
2370
2371
2372    ld1         {v10.4h, v11.4h},[x1],#16
2373    ld1         {v8.4h, v9.4h},[x1],x10
2374
2375
2376    smlsl       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
2377    smlsl       v26.4s, v8.4h, v0.h[1]     //// y1 * cos3(part of b1)
2378    smlal       v28.4s, v8.4h, v6.h[3]     //// y1 * sin3(part of b2)
2379    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
2380
2381    smlsl       v24.4s, v9.4h, v0.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2382    smlal       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2383    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2384    smlsl       v30.4s, v9.4h, v2.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
2385
2386
2387
2388
2389
2390    smlal       v20.4s, v10.4h, v0.h[0]
2391    smlsl       v20.4s, v11.4h, v3.h[2]
2392
2393
2394    smlsl       v22.4s, v10.4h, v0.h[0]
2395    smlsl       v22.4s, v11.4h, v5.h[2]
2396
2397    smlsl       v16.4s, v10.4h, v0.h[0]
2398    smlal       v16.4s, v11.4h, v1.h[2]
2399
2400    smlal       v18.4s, v10.4h, v0.h[0]
2401    smlal       v18.4s, v11.4h, v7.h[2]
2402
2403    ld1         {v12.4h, v13.4h},[x1],#16
2404    ld1         {v14.4h, v15.4h},[x1],x10
2405
2406
2407
2408
2409    smlal       v24.4s, v14.4h, v6.h[3]
2410    smlal       v26.4s, v14.4h, v3.h[3]
2411    smlsl       v28.4s, v14.4h, v1.h[3]
2412    smlal       v30.4s, v14.4h, v7.h[1]
2413
2414
2415    smlal       v24.4s, v15.4h, v1.h[3]
2416    smlsl       v26.4s, v15.4h, v2.h[3]
2417    smlal       v28.4s, v15.4h, v7.h[1]
2418    smlal       v30.4s, v15.4h, v4.h[1]
2419
2420
2421    smlsl       v20.4s, v12.4h, v5.h[0]
2422    smlal       v20.4s, v13.4h, v2.h[2]
2423    smlal       v22.4s, v12.4h, v1.h[0]
2424    smlsl       v22.4s, v13.4h, v7.h[2]
2425    smlsl       v16.4s, v12.4h, v7.h[0]
2426    smlsl       v16.4s, v13.4h, v3.h[2]
2427    smlsl       v18.4s, v12.4h, v3.h[0]
2428    smlal       v18.4s, v13.4h, v1.h[2]
2429
2430
2431    ld1         {v10.4h, v11.4h},[x1],#16
2432    ld1         {v8.4h, v9.4h},[x1],x10
2433
2434
2435    smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
2436    smlsl       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
2437    smlal       v28.4s, v8.4h, v3.h[1]     //// y1 * sin3(part of b2)
2438    smlsl       v30.4s, v8.4h, v0.h[1]     //// y1 * sin1(part of b3)
2439
2440    smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
2441    smlal       v26.4s, v9.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
2442    smlsl       v28.4s, v9.4h, v2.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
2443    smlal       v30.4s, v9.4h, v4.h[3]     //// y1 * sin1 - y3 * sin3(part of b3)
2444
2445
2446
2447
2448
2449    smlal       v20.4s, v10.4h, v6.h[0]
2450    smlsl       v20.4s, v11.4h, v1.h[2]
2451
2452
2453    smlsl       v22.4s, v10.4h, v2.h[0]
2454    smlal       v22.4s, v11.4h, v4.h[2]
2455
2456    smlal       v16.4s, v10.4h, v2.h[0]
2457    smlsl       v16.4s, v11.4h, v7.h[2]
2458
2459    smlsl       v18.4s, v10.4h, v6.h[0]
2460    smlsl       v18.4s, v11.4h, v5.h[2]
2461
2462    ld1         {v12.4h, v13.4h},[x1],#16
2463    ld1         {v14.4h, v15.4h},[x1],x10
2464
2465
2466
2467    smlal       v24.4s, v14.4h, v4.h[3]
2468    smlsl       v26.4s, v14.4h, v6.h[1]
2469    smlal       v28.4s, v14.4h, v7.h[3]
2470    smlal       v30.4s, v14.4h, v6.h[3]
2471
2472
2473    smlal       v24.4s, v15.4h, v3.h[3]
2474    smlsl       v26.4s, v15.4h, v3.h[1]
2475    smlal       v28.4s, v15.4h, v2.h[3]
2476    smlsl       v30.4s, v15.4h, v2.h[1]
2477
2478
2479    smlsl       v20.4s, v12.4h, v7.h[0]
2480    smlal       v20.4s, v13.4h, v0.h[2]
2481    smlal       v22.4s, v12.4h, v5.h[0]
2482    smlsl       v22.4s, v13.4h, v1.h[2]
2483    smlsl       v16.4s, v12.4h, v3.h[0]
2484    smlal       v16.4s, v13.4h, v2.h[2]
2485    smlal       v18.4s, v12.4h, v1.h[0]
2486    smlsl       v18.4s, v13.4h, v3.h[2]
2487
// stage2_shift3: finish one group of stage-2 results and write it out.
//   1. Butterfly: v20+v24, v22+v26, v16+v28, v18+v30 and the matching
//      differences (named a0..a3 / b0..b3 in the trailing comments).
//   2. sqrshrn: signed saturating rounding shift right by shift_stage2_idct,
//      narrowing every 32-bit lane to 16 bits.
//   3. Two 4x4 halfword transposes built from trn1/trn2.
//   4. Four 16-byte stores through x0 (x0 advances by 64 bytes in total).
// The x0..x7 names in the trailing comments are local labels for the eight
// result vectors of this group, not the general-purpose registers.
// NOTE(review): the trailing comments say ">> 7"; the actual shift amount is
// whatever shift_stage2_idct is defined as outside this view - confirm.
2488stage2_shift3:
2489    add         v8.4s,  v20.4s ,  v24.4s
2490    sub         v10.4s,  v20.4s ,  v24.4s
2491
2492    add         v12.4s,  v22.4s ,  v26.4s
2493    sub         v24.4s,  v22.4s ,  v26.4s
2494
2495    add         v14.4s,  v16.4s ,  v28.4s
2496    sub         v26.4s,  v16.4s ,  v28.4s
2497
2498
2499    add         v16.4s,  v18.4s ,  v30.4s
2500    sub         v28.4s,  v18.4s ,  v30.4s
2501
2502
2503    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2504    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2505    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2506    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2507    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2508    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2509    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2510    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2511
    // v24-v27 serve as transpose scratch below, so their low 64 bits are
    // parked in general-purpose registers and put back afterwards.
    // NOTE(review): x19/x20 are callee-saved under AAPCS64 - presumably the
    // prologue (outside this view) preserves them; confirm.
2512    umov        x15,v24.d[0]
2513    umov        x16,v25.d[0]
2514    umov        x19,v26.d[0]
2515    umov        x20,v27.d[0]
2516
    // 4x4 transpose of rows (v30, v12, v31, v13) = (x0, x1, x2, x3):
    // first interleave halfwords pairwise into the scratch registers ...
2517    trn1        v24.4h, v30.4h, v12.4h
2518    trn2        v25.4h, v30.4h, v12.4h
2519    trn1        v26.4h, v31.4h, v13.4h
2520    trn2        v27.4h, v31.4h, v13.4h
2521
    // ... then interleave 32-bit pairs. Result: v30 = column 0,
    // v31 = column 2, v12 = column 1, v13 = column 3.
2522    trn1        v30.2s, v24.2s, v26.2s
2523    trn2        v31.2s, v24.2s, v26.2s
2524    trn1        v12.2s, v25.2s, v27.2s
2525    trn2        v13.2s, v25.2s, v27.2s
2526
    // Same transpose for rows (v14, v18, v15, v19) = (x4, x5, x6, x7).
2527    trn1        v24.4h, v14.4h, v18.4h
2528    trn2        v25.4h, v14.4h, v18.4h
2529    trn1        v26.4h, v15.4h, v19.4h
2530    trn2        v27.4h, v15.4h, v19.4h
2531
    // Result: v14 = column 0, v15 = column 2, v18 = column 1, v19 = column 3.
2532    trn1        v14.2s, v24.2s, v26.2s
2533    trn2        v15.2s, v24.2s, v26.2s
2534    trn1        v18.2s, v25.2s, v27.2s
2535    trn2        v19.2s, v25.2s, v27.2s
2536
    // Put back the low 64 bits of v24-v27 saved before the transposes.
2537    mov         v24.d[0],x15
2538    mov         v25.d[0],x16
2539    mov         v26.d[0],x19
2540    mov         v27.d[0],x20
2541
    // Store order per 4x4 tile is column 0, 2, 1, 3 (16 bytes per st1).
2542    st1         { v30.4h, v31.4h},[x0],#16
2543    st1         { v12.4h, v13.4h},[x0],#16
2544    st1         { v14.4h, v15.4h},[x0],#16
2545    st1         { v18.4h, v19.4h},[x0],#16
2546
2547
2548
// Accumulation pass for the group that is finished at stage2_shift4.
//   x1              : read pointer, restarted from x4; every load pair
//                     advances it by 16 bytes and then by x10.
//   v0-v7 (.h lanes): 16-bit multipliers; they are not written in this pass.
//   v24/v26/v28/v30 : 32-bit accumulators fed from v8/v9 and v14/v15
//                     (the b0..b3 terms at stage2_shift4).
//   v20/v22/v16/v18 : 32-bit accumulators fed from v10/v11 and v12/v13
//                     (the a0..a3 terms at stage2_shift4).
// Each "cmp x12,<reg> / bhs stage2_shift4" leaves early when x12 >= <reg>
// (unsigned), skipping the remaining loads - presumably because the rest of
// the input is known to be zero; confirm against where x12 is set up.
// NOTE(review): the "y1 * cos1 ..." trailing comments are repeated verbatim
// on different multipliers and on both smlal and smlsl, so they do not
// describe the individual instructions; only the "(part of bN)" suffix
// matches the accumulator being updated.
2549    mov         x1,x4
2550
2551
2552
2553
2554    ld1         {v10.4h, v11.4h},[x1],#16
2555    ld1         {v8.4h, v9.4h},[x1],x10
2556
2557
    // smull starts each accumulator afresh; everything after it accumulates.
2558    smull       v24.4s, v8.4h, v6.h[1]     //// y1 * cos1(part of b0)
2559    smull       v26.4s, v8.4h, v6.h[3]     //// y1 * cos3(part of b1)
2560    smull       v28.4s, v8.4h, v7.h[1]     //// y1 * sin3(part of b2)
2561    smull       v30.4s, v8.4h, v7.h[3]     //// y1 * sin1(part of b3)
2562
2563    smlsl       v24.4s, v9.4h, v2.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
2564    smlsl       v26.4s, v9.4h, v4.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
2565    smlsl       v28.4s, v9.4h, v5.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2566    smlsl       v30.4s, v9.4h, v7.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2567
2568
2569
2570
2571
    // All four of these accumulators start from the same v10 * v0.h[0] term.
2572    smull       v20.4s, v10.4h, v0.h[0]
2573    smlsl       v20.4s, v11.4h, v3.h[2]
2574
2575
2576    smull       v22.4s, v10.4h, v0.h[0]
2577    smlsl       v22.4s, v11.4h, v2.h[2]
2578
2579    smull       v16.4s, v10.4h, v0.h[0]
2580    smlsl       v16.4s, v11.4h, v1.h[2]
2581
2582    smull       v18.4s, v10.4h, v0.h[0]
2583    smlsl       v18.4s, v11.4h, v0.h[2]
2584
    // Early-out check 1 (x12 against x11).
2585    cmp         x12,x11
2586    bhs         stage2_shift4
2587    ld1         {v12.4h, v13.4h},[x1],#16
2588    ld1         {v14.4h, v15.4h},[x1],x10
2589
2590
2591
2592
2593
2594
2595    smlal       v24.4s, v14.4h, v0.h[1]
2596    smlal       v26.4s, v14.4h, v1.h[3]
2597    smlal       v28.4s, v14.4h, v4.h[1]
2598    smlal       v30.4s, v14.4h, v6.h[3]
2599
2600
2601    smlsl       v24.4s, v15.4h, v4.h[1]
2602    smlsl       v26.4s, v15.4h, v0.h[3]
2603    smlsl       v28.4s, v15.4h, v2.h[3]
2604    smlsl       v30.4s, v15.4h, v6.h[1]
2605
2606
2607    smlal       v20.4s, v12.4h, v7.h[0]
2608    smlal       v20.4s, v13.4h, v5.h[2]
2609    smlal       v22.4s, v12.4h, v5.h[0]
2610    smlsl       v22.4s, v13.4h, v7.h[2]
2611    smlal       v16.4s, v12.4h, v3.h[0]
2612    smlsl       v16.4s, v13.4h, v4.h[2]
2613    smlal       v18.4s, v12.4h, v1.h[0]
2614    smlsl       v18.4s, v13.4h, v1.h[2]
2615
    // Early-out check 2 (x12 against x5).
2616    cmp         x12,x5
2617    bhs         stage2_shift4
2618
2619    ld1         {v10.4h, v11.4h},[x1],#16
2620    ld1         {v8.4h, v9.4h},[x1],x10
2621
2622
2623
2624    smlal       v24.4s, v8.4h, v7.h[3]     //// y1 * cos1(part of b0)
2625    smlal       v26.4s, v8.4h, v3.h[1]     //// y1 * cos3(part of b1)
2626    smlal       v28.4s, v8.4h, v1.h[1]     //// y1 * sin3(part of b2)
2627    smlal       v30.4s, v8.4h, v5.h[3]     //// y1 * sin1(part of b3)
2628
2629    smlal       v24.4s, v9.4h, v4.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
2630    smlsl       v26.4s, v9.4h, v5.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2631    smlsl       v28.4s, v9.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
2632    smlsl       v30.4s, v9.4h, v5.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2633
2634
2635
2636
2637
2638    smlsl       v20.4s, v10.4h, v2.h[0]
2639    smlal       v20.4s, v11.4h, v1.h[2]
2640
2641
2642    smlsl       v22.4s, v10.4h, v6.h[0]
2643    smlal       v22.4s, v11.4h, v3.h[2]
2644
2645    smlal       v16.4s, v10.4h, v6.h[0]
2646    smlsl       v16.4s, v11.4h, v7.h[2]
2647
2648    smlal       v18.4s, v10.4h, v2.h[0]
2649    smlsl       v18.4s, v11.4h, v2.h[2]
2650
    // Early-out check 3 (x12 against x6).
2651    cmp         x12,x6
2652    bhs         stage2_shift4
2653
2654
2655    ld1         {v12.4h, v13.4h},[x1],#16
2656    ld1         {v14.4h, v15.4h},[x1],x10
2657
2658
2659
2660
2661
2662
2663    smlsl       v24.4s, v14.4h, v1.h[1]
2664    smlsl       v26.4s, v14.4h, v7.h[3]
2665    smlal       v28.4s, v14.4h, v1.h[3]
2666    smlal       v30.4s, v14.4h, v4.h[3]
2667
2668
2669    smlal       v24.4s, v15.4h, v2.h[1]
2670    smlal       v26.4s, v15.4h, v5.h[1]
2671    smlsl       v28.4s, v15.4h, v3.h[1]
2672    smlsl       v30.4s, v15.4h, v4.h[1]
2673
2674
2675    smlsl       v20.4s, v12.4h, v5.h[0]
2676    smlsl       v20.4s, v13.4h, v7.h[2]
2677    smlsl       v22.4s, v12.4h, v1.h[0]
2678    smlal       v22.4s, v13.4h, v1.h[2]
2679    smlsl       v16.4s, v12.4h, v7.h[0]
2680    smlal       v16.4s, v13.4h, v5.h[2]
2681    smlal       v18.4s, v12.4h, v3.h[0]
2682    smlsl       v18.4s, v13.4h, v3.h[2]
2683
    // Early-out check 4 (x12 against x9). Past this point there are no
    // further checks: the remaining four load pairs always run.
2684    cmp         x12,x9
2685    bhs         stage2_shift4
2686
2687
2688    ld1         {v10.4h, v11.4h},[x1],#16
2689    ld1         {v8.4h, v9.4h},[x1],x10
2690
2691
2692    smlsl       v24.4s, v8.4h, v5.h[3]     //// y1 * cos1(part of b0)
2693    smlsl       v26.4s, v8.4h, v2.h[3]     //// y1 * cos3(part of b1)
2694    smlal       v28.4s, v8.4h, v4.h[3]     //// y1 * sin3(part of b2)
2695    smlal       v30.4s, v8.4h, v3.h[3]     //// y1 * sin1(part of b3)
2696
2697    smlsl       v24.4s, v9.4h, v6.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
2698    smlal       v26.4s, v9.4h, v0.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
2699    smlsl       v28.4s, v9.4h, v6.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
2700    smlsl       v30.4s, v9.4h, v3.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2701
2702
2703
2704
2705
2706    smlal       v20.4s, v10.4h, v0.h[0]
2707    smlsl       v20.4s, v11.4h, v0.h[2]
2708
2709
2710    smlsl       v22.4s, v10.4h, v0.h[0]
2711    smlal       v22.4s, v11.4h, v6.h[2]
2712
2713    smlsl       v16.4s, v10.4h, v0.h[0]
2714    smlal       v16.4s, v11.4h, v2.h[2]
2715
2716    smlal       v18.4s, v10.4h, v0.h[0]
2717    smlsl       v18.4s, v11.4h, v4.h[2]
2718
2719    ld1         {v12.4h, v13.4h},[x1],#16
2720    ld1         {v14.4h, v15.4h},[x1],x10
2721
2722
2723
2724
2725    smlal       v24.4s, v14.4h, v3.h[1]
2726    smlsl       v26.4s, v14.4h, v2.h[1]
2727    smlal       v28.4s, v14.4h, v7.h[3]
2728    smlal       v30.4s, v14.4h, v2.h[3]
2729
2730
2731    smlsl       v24.4s, v15.4h, v0.h[3]
2732    smlal       v26.4s, v15.4h, v4.h[3]
2733    smlal       v28.4s, v15.4h, v6.h[3]
2734    smlsl       v30.4s, v15.4h, v2.h[1]
2735
2736
2737    smlal       v20.4s, v12.4h, v3.h[0]
2738    smlsl       v20.4s, v13.4h, v6.h[2]
2739    smlal       v22.4s, v12.4h, v7.h[0]
2740    smlsl       v22.4s, v13.4h, v4.h[2]
2741    smlsl       v16.4s, v12.4h, v1.h[0]
2742    smlal       v16.4s, v13.4h, v0.h[2]
2743    smlal       v18.4s, v12.4h, v5.h[0]
2744    smlsl       v18.4s, v13.4h, v5.h[2]
2745
2746
2747    ld1         {v10.4h, v11.4h},[x1],#16
2748    ld1         {v8.4h, v9.4h},[x1],x10
2749
2750
2751
2752
2753    smlal       v24.4s, v8.4h, v3.h[3]     //// y1 * cos1(part of b0)
2754    smlsl       v26.4s, v8.4h, v7.h[1]     //// y1 * cos3(part of b1)
2755    smlsl       v28.4s, v8.4h, v5.h[1]     //// y1 * sin3(part of b2)
2756    smlal       v30.4s, v8.4h, v1.h[3]     //// y1 * sin1(part of b3)
2757
2758    smlsl       v24.4s, v9.4h, v7.h[1]     //// y1 * cos1 + y3 * cos3(part of b0)
2759    smlsl       v26.4s, v9.4h, v6.h[1]     //// y1 * cos3 - y3 * sin1(part of b1)
2760    smlal       v28.4s, v9.4h, v3.h[3]     //// y1 * sin3 - y3 * cos1(part of b2)
2761    smlsl       v30.4s, v9.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
2762
2763
2764
2765
2766
2767    smlsl       v20.4s, v10.4h, v6.h[0]
2768    smlal       v20.4s, v11.4h, v2.h[2]
2769
2770
2771    smlal       v22.4s, v10.4h, v2.h[0]
2772    smlsl       v22.4s, v11.4h, v0.h[2]
2773
2774    smlsl       v16.4s, v10.4h, v2.h[0]
2775    smlal       v16.4s, v11.4h, v3.h[2]
2776
2777    smlal       v18.4s, v10.4h, v6.h[0]
2778    smlsl       v18.4s, v11.4h, v6.h[2]
2779
2780
2781    ld1         {v12.4h, v13.4h},[x1],#16
2782    ld1         {v14.4h, v15.4h},[x1],x10
2783
2784
2785
2786    smlsl       v24.4s, v14.4h, v5.h[1]
2787    smlal       v26.4s, v14.4h, v3.h[3]
2788    smlsl       v28.4s, v14.4h, v2.h[1]
2789    smlal       v30.4s, v14.4h, v0.h[3]
2790
2791
2792    smlal       v24.4s, v15.4h, v1.h[3]
2793    smlsl       v26.4s, v15.4h, v1.h[1]
2794    smlal       v28.4s, v15.4h, v0.h[3]
2795    smlsl       v30.4s, v15.4h, v0.h[1]
2796
2797
2798    smlsl       v20.4s, v12.4h, v1.h[0]
2799    smlal       v20.4s, v13.4h, v4.h[2]
2800    smlal       v22.4s, v12.4h, v3.h[0]
2801    smlsl       v22.4s, v13.4h, v5.h[2]
2802    smlsl       v16.4s, v12.4h, v5.h[0]
2803    smlal       v16.4s, v13.4h, v6.h[2]
2804    smlal       v18.4s, v12.4h, v7.h[0]
2805    smlsl       v18.4s, v13.4h, v7.h[2]
2806
// stage2_shift4: finish one group of stage-2 results, write it out, then
// rewind the output pointer.
//   1. Butterfly: v20+v24, v22+v26, v16+v28, v18+v30 and the matching
//      differences (named a0..a3 / b0..b3 in the trailing comments).
//   2. sqrshrn: signed saturating rounding shift right by shift_stage2_idct,
//      narrowing every 32-bit lane to 16 bits.
//   3. Two 4x4 halfword transposes built from trn1/trn2.
//   4. Four 16-byte stores through x0 (x0 advances by 64 bytes in total).
//   5. x0 is moved back by 256 bytes.
// The x0..x7 names in the trailing comments are local labels for the eight
// result vectors of this group, not the general-purpose registers.
// NOTE(review): the trailing comments say ">> 7"; the actual shift amount is
// whatever shift_stage2_idct is defined as outside this view - confirm.
2807stage2_shift4:
2808    add         v8.4s,  v20.4s ,  v24.4s
2809    sub         v10.4s,  v20.4s ,  v24.4s
2810
2811    add         v12.4s,  v22.4s ,  v26.4s
2812    sub         v24.4s,  v22.4s ,  v26.4s
2813
2814    add         v14.4s,  v16.4s ,  v28.4s
2815    sub         v26.4s,  v16.4s ,  v28.4s
2816
2817
2818    add         v16.4s,  v18.4s ,  v30.4s
2819    sub         v28.4s,  v18.4s ,  v30.4s
2820
2821
2822    sqrshrn     v30.4h, v8.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage2_idct)
2823    sqrshrn     v19.4h, v10.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage2_idct)
2824    sqrshrn     v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage2_idct)
2825    sqrshrn     v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage2_idct)
2826    sqrshrn     v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage2_idct)
2827    sqrshrn     v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage2_idct)
2828    sqrshrn     v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage2_idct)
2829    sqrshrn     v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage2_idct)
2830
2831
2832
    // v24-v27 serve as transpose scratch below, so their low 64 bits are
    // parked in general-purpose registers and put back afterwards.
    // NOTE(review): x19/x20 are callee-saved under AAPCS64 - presumably the
    // prologue (outside this view) preserves them; confirm.
2833    umov        x15,v24.d[0]
2834    umov        x16,v25.d[0]
2835    umov        x19,v26.d[0]
2836    umov        x20,v27.d[0]
2837
    // 4x4 transpose of rows (v30, v12, v31, v13) = (x0, x1, x2, x3):
    // first interleave halfwords pairwise into the scratch registers ...
2838    trn1        v24.4h, v30.4h, v12.4h
2839    trn2        v25.4h, v30.4h, v12.4h
2840    trn1        v26.4h, v31.4h, v13.4h
2841    trn2        v27.4h, v31.4h, v13.4h
2842
    // ... then interleave 32-bit pairs. Result: v30 = column 0,
    // v31 = column 2, v12 = column 1, v13 = column 3.
2843    trn1        v30.2s, v24.2s, v26.2s
2844    trn2        v31.2s, v24.2s, v26.2s
2845    trn1        v12.2s, v25.2s, v27.2s
2846    trn2        v13.2s, v25.2s, v27.2s
2847
    // Same transpose for rows (v14, v18, v15, v19) = (x4, x5, x6, x7).
2848    trn1        v24.4h, v14.4h, v18.4h
2849    trn2        v25.4h, v14.4h, v18.4h
2850    trn1        v26.4h, v15.4h, v19.4h
2851    trn2        v27.4h, v15.4h, v19.4h
2852
    // Result: v14 = column 0, v15 = column 2, v18 = column 1, v19 = column 3.
2853    trn1        v14.2s, v24.2s, v26.2s
2854    trn2        v15.2s, v24.2s, v26.2s
2855    trn1        v18.2s, v25.2s, v27.2s
2856    trn2        v19.2s, v25.2s, v27.2s
2857
    // Put back the low 64 bits of v24-v27 saved before the transposes.
2858    mov         v24.d[0],x15
2859    mov         v25.d[0],x16
2860    mov         v26.d[0],x19
2861    mov         v27.d[0],x20
2862
    // Store order per 4x4 tile is column 0, 2, 1, 3 (16 bytes per st1).
2863    st1         { v30.4h, v31.4h},[x0],#16
2864    st1         { v12.4h, v13.4h},[x0],#16
2865    st1         { v14.4h, v15.4h},[x0],#16
2866    st1         { v18.4h, v19.4h},[x0],#16
2867
2868
2869
2870
    // Rewind x0 by 256 bytes so the code that follows reads back what was
    // just written. 64 of those bytes come from the four stores above; the
    // remaining 192 presumably come from the three earlier groups - confirm.
2871    sub         x0,x0,#256
2872prediction_buffer:
2873
2874
2875    ld1         {v12.8h},[x0],#16
2876    ld1         {v14.8h},[x0],#16
2877
2878    add         x0,x0,#32
2879
2880    ld1         {v16.8h},[x0],#16
2881    ld1         {v18.8h},[x0],#16
2882    add         x0,x0,#32
2883
2884    ld1         {v20.8h},[x0],#16
2885    ld1         {v22.8h},[x0],#16
2886
2887
2888    add         x0,x0,#32
2889
2890    ld1         {v24.8h},[x0],#16
2891    ld1         {v26.8h},[x0],#16
2892
2893
2894
2895
2896
2897// v12.d[0] = x0 1- 4 values
2898// v12.d[1] = x2 1- 4 values
2899// v14.d[0] = x1 1- 4 values
2900// v14.d[1] = x3 1- 4 values
2901
2902// v16.d[0] = x0 5- 8 values
2903// v16.d[1] = x2 5- 8 values
2904// v18.d[0] = x1 5- 8 values
2905// v18.d[1] = x3 5- 8 values
2906
2907// v20.d[0] = x0 9- 12 values
2908// v20.d[1] = x2 9- 12 values
2909// v22.d[0] = x1 9- 12 values
2910// v22.d[1] = x3 9- 12 values
2911
2912// v24.d[0] = x0 13- 16 values
2913// v24.d[1] = x2 13- 16 values
2914// v26.d[0] = x1 13- 16 values
2915// v26.d[1] = x3 13- 16 values
2916
2917    // swapping v12 upper and v16 lower 64bits
2918    mov         v13.d[0], v12.d[1]
2919    mov         v12.d[1], v16.d[0]
2920    mov         v16.d[0], v13.d[0]
2921    // swapping v20 upper and v24 lower 64bits
2922    mov         v21.d[0], v20.d[1]
2923    mov         v20.d[1], v24.d[0]
2924    mov         v24.d[0], v21.d[0]
2925    // swapping v14 upper and v18 lower 64bits
2926    mov         v15.d[0], v14.d[1]
2927    mov         v14.d[1], v18.d[0]
2928    mov         v18.d[0], v15.d[0]
2929    // swapping v22 upper and v26 lower 64bits
2930    mov         v23.d[0], v22.d[1]
2931    mov         v22.d[1], v26.d[0]
2932    mov         v26.d[0], v23.d[0]
2933
2934
2935    ld1         {v8.8b, v9.8b},[x2],x8
2936    ld1         {v10.8b, v11.8b},[x2],x8
2937    ld1         {v28.8b, v29.8b},[x2],x8
2938    ld1         {v30.8b, v31.8b},[x2],x8
2939
2940
2941    uaddw       v12.8h,  v12.8h ,  v8.8b
2942    uaddw       v20.8h,  v20.8h ,  v9.8b
2943    uaddw       v14.8h,  v14.8h ,  v10.8b
2944    uaddw       v22.8h,  v22.8h ,  v11.8b
2945    uaddw       v16.8h,  v16.8h ,  v28.8b
2946    uaddw       v24.8h,  v24.8h ,  v29.8b
2947    uaddw       v18.8h,  v18.8h ,  v30.8b
2948    uaddw       v26.8h,  v26.8h ,  v31.8b
2949    sub         x2,x2,x8,lsl #2
2950    add         x2,x2,#16
2951    sqxtun      v12.8b, v12.8h
2952    sqxtun      v13.8b, v20.8h
2953    sqxtun      v20.8b, v14.8h
2954    sqxtun      v21.8b, v22.8h
2955    sqxtun      v14.8b, v16.8h
2956    sqxtun      v15.8b, v24.8h
2957    sqxtun      v22.8b, v18.8h
2958    sqxtun      v23.8b, v26.8h
2959
2960
2961    st1         {v12.8b, v13.8b},[x3],x7
2962    st1         {v20.8b, v21.8b},[x3],x7
2963    st1         {v14.8b, v15.8b},[x3],x7
2964    st1         {v22.8b, v23.8b},[x3],x7
2965
2966
2967    sub         x3,x3,x7,lsl #2
2968    add         x3,x3,#16
2969
2970    ld1         {v12.8h},[x0],#16
2971    ld1         {v14.8h},[x0],#16
2972
2973    sub         x0,x0,#96
2974
2975    ld1         {v16.8h},[x0],#16
2976    ld1         {v18.8h},[x0],#16
2977    sub         x0,x0,#96
2978
2979    ld1         {v20.8h},[x0],#16
2980    ld1         {v22.8h},[x0],#16
2981
2982
2983    sub         x0,x0,#96
2984
2985    ld1         {v24.8h},[x0],#16
2986    ld1         {v26.8h},[x0],#16
2987
2988
2989    sub         x0,x0,#64
2990
2991
2992    // swapping v12 upper and v16 lower 64bits
2993    mov         v13.d[0], v12.d[1]
2994    mov         v12.d[1], v16.d[0]
2995    mov         v16.d[0], v13.d[0]
2996    // swapping v20 upper and v24 lower 64bits
2997    mov         v21.d[0], v20.d[1]
2998    mov         v20.d[1], v24.d[0]
2999    mov         v24.d[0], v21.d[0]
    // swapping v14 upper and v18 lower 64bits
3001    mov         v15.d[0], v14.d[1]
3002    mov         v14.d[1], v18.d[0]
3003    mov         v18.d[0], v15.d[0]
3004    // swapping v22 upper and v26 lower 64bits
3005    mov         v23.d[0], v22.d[1]
3006    mov         v22.d[1], v26.d[0]
3007    mov         v26.d[0], v23.d[0]
3008
3009
3010    ld1         {v8.8b, v9.8b},[x2],x8
3011    ld1         {v10.8b, v11.8b},[x2],x8
3012    ld1         {v28.8b, v29.8b},[x2],x8
3013    ld1         {v30.8b, v31.8b},[x2],x8
3014
3015
3016    uaddw       v12.8h,  v12.8h ,  v8.8b
3017    uaddw       v20.8h,  v20.8h ,  v9.8b
3018    uaddw       v14.8h,  v14.8h ,  v10.8b
3019    uaddw       v22.8h,  v22.8h ,  v11.8b
3020    uaddw       v16.8h,  v16.8h ,  v28.8b
3021    uaddw       v24.8h,  v24.8h ,  v29.8b
3022    uaddw       v18.8h,  v18.8h ,  v30.8b
3023    uaddw       v26.8h,  v26.8h ,  v31.8b
3024    sub         x2,x2,#16
3025
3026    sqxtun      v12.8b, v12.8h
3027    sqxtun      v13.8b, v20.8h
3028    sqxtun      v20.8b, v14.8h
3029    sqxtun      v21.8b, v22.8h
3030    sqxtun      v14.8b, v16.8h
3031    sqxtun      v15.8b, v24.8h
3032    sqxtun      v22.8b, v18.8h
3033    sqxtun      v23.8b, v26.8h
3034
3035
3036    st1         {v12.8b, v13.8b},[x3],x7
3037    st1         {v20.8b, v21.8b},[x3],x7
3038    st1         {v14.8b, v15.8b},[x3],x7
3039    st1         {v22.8b, v23.8b},[x3],x7
3040
3041    sub         x3,x3,#16
3042
3043    subs        x14,x14,#1
3044    bne         dct_stage2
3045    // ldmfd sp!,{x0-x12,pc}
3046    ldp         x19, x20,[sp],#16
3047    pop_v_regs
3048    ret
3049
3050
3051
3052
3053
3054