;
;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Assembler preamble: exports the vp9_iht8x8_64_add_neon symbol, selects the
; ARM instruction set, declares the 8-byte stack-alignment attributes
; (REQUIRE8 / PRESERVE8) and opens the read-only code area that the macros
; and code below are assembled into (ALIGN=2, i.e. a 2^2-byte boundary).
    EXPORT  |vp9_iht8x8_64_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; GENERATE_IADST_CONSTANTS
;
; Materialises the constants consumed by the IADST in core registers.
; Every cosine constant is assembled from two immediates: a mov of the high
; byte followed by an add of the low byte.
;
; Out:   r0  = cospi_2_64  (16305)    r1  = cospi_30_64 (1606)
;        r2  = cospi_10_64 (14449)    r3  = cospi_22_64 (7723)
;        r4  = cospi_18_64 (10394)    r5  = cospi_14_64 (12665)
;        r6  = cospi_26_64 (4756)     r7  = cospi_6_64  (15679)
;        r8  = cospi_8_64  (15137)    r9  = cospi_24_64 (6270)
;        r10 = 0                      r12 = cospi_16_64 (11585)
; Clobb: r0-r10, r12. r11 is not written. No flag-setting forms are used.
; NOTE(review): r4-r10 are callee-saved under the AAPCS; the enclosing
; function is presumably responsible for saving them - confirm at the
; expansion site.
    MACRO
    GENERATE_IADST_CONSTANTS
    ; r0 = cospi_2_64 = 0x3f00 + 0xb1 = 16305
    mov             r0, #0x3f00
    add             r0, #0xb1

    ; r1 = cospi_30_64 = 0x600 + 0x46 = 1606
    mov             r1, #0x600
    add             r1, #0x46

    ; r2 = cospi_10_64 = 0x3800 + 0x71 = 14449
    mov             r2, #0x3800
    add             r2, #0x71

    ; r3 = cospi_22_64 = 0x1e00 + 0x2b = 7723
    mov             r3, #0x1e00
    add             r3, #0x2b

    ; r4 = cospi_18_64 = 0x2800 + 0x9a = 10394
    mov             r4, #0x2800
    add             r4, #0x9a

    ; r5 = cospi_14_64 = 0x3100 + 0x79 = 12665
    mov             r5, #0x3100
    add             r5, #0x79

    ; r6 = cospi_26_64 = 0x1200 + 0x94 = 4756
    mov             r6, #0x1200
    add             r6, #0x94

    ; r7 = cospi_6_64 = 0x3d00 + 0x3f = 15679
    mov             r7, #0x3d00
    add             r7, #0x3f

    ; r8 = cospi_8_64 = 0x3b00 + 0x21 = 15137
    mov             r8, #0x3b00
    add             r8, #0x21

    ; r9 = cospi_24_64 = 0x1800 + 0x7e = 6270
    mov             r9, #0x1800
    add             r9, #0x7e

    ; r10 = 0
    mov             r10, #0

    ; r12 = cospi_16_64 = 0x2d00 + 0x41 = 11585
    mov             r12, #0x2d00
    add             r12, #0x41
    MEND

; GENERATE_IDCT_CONSTANTS
;
; Materialises the seven cosine constants consumed by IDCT8x8_1D in core
; registers. Every constant is assembled from two immediates: a mov of the
; high byte followed by an add of the low byte.
;
; Out:   r3 = cospi_28_64 (3196)     r4 = cospi_4_64  (16069)
;        r5 = cospi_12_64 (13623)    r6 = cospi_20_64 (9102)
;        r7 = cospi_16_64 (11585)    r8 = cospi_24_64 (6270)
;        r9 = cospi_8_64  (15137)
; Clobb: r3-r9. No flag-setting forms are used.
; NOTE(review): r4-r9 are callee-saved under the AAPCS; the enclosing
; function is presumably responsible for saving them - confirm at the
; expansion site.
    MACRO
    GENERATE_IDCT_CONSTANTS
    ; r3 = cospi_28_64 = 0x0c00 + 0x7c = 3196
    mov             r3, #0x0c00
    add             r3, #0x7c

    ; r4 = cospi_4_64 = 0x3e00 + 0xc5 = 16069
    mov             r4, #0x3e00
    add             r4, #0xc5

    ; r5 = cospi_12_64 = 0x3500 + 0x37 = 13623
    mov             r5, #0x3500
    add             r5, #0x37

    ; r6 = cospi_20_64 = 0x2300 + 0x8e = 9102
    mov             r6, #0x2300
    add             r6, #0x8e

    ; r7 = cospi_16_64 = 0x2d00 + 0x41 = 11585
    mov             r7, #0x2d00
    add             r7, #0x41

    ; r8 = cospi_24_64 = 0x1800 + 0x7e = 6270
    mov             r8, #0x1800
    add             r8, #0x7e

    ; r9 = cospi_8_64 = 0x3b00 + 0x21 = 15137
    mov             r9, #0x3b00
    add             r9, #0x21
    MEND

; TRANSPOSE8X8
;
; In-place transpose of an 8x8 matrix of 16-bit elements held in q8-q15
; (eight 16-bit lanes per q register).
;
; In:    q8-q15 = matrix to transpose
; Out:   q8-q15 = transposed matrix
; Clobb: none beyond q8-q15 (no core registers, no scratch NEON registers)
    MACRO
    TRANSPOSE8X8
    ; Exchange the upper d half of q8-q11 with the lower d half of q12-q15,
    ; i.e. swap the two off-diagonal 4x4 quadrants.
    vswp            d17, d24
    vswp            d23, d30
    vswp            d21, d28
    vswp            d19, d26
    ; Transpose at 32-bit granularity between registers two apart ...
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q13, q15
    ; ... then at 16-bit granularity between adjacent registers.
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.16         q14, q15
    MEND

; IDCT8x8_1D
;
; One-dimensional 8-point inverse DCT, computed in parallel on every 16-bit
; lane of q8-q15. input[n] below refers to register q(8+n). Products are
; accumulated in 32 bits and narrowed back to 16 bits with a saturating,
; rounding right shift by 14 (vqrshrn #14), written dct_const_round_shift()
; in the comments.
;
; In:    q8-q15 = input[0]..input[7]
;        r3 = cospi_28_64, r4 = cospi_4_64,  r5 = cospi_12_64,
;        r6 = cospi_20_64, r7 = cospi_16_64, r8 = cospi_24_64,
;        r9 = cospi_8_64  (the layout produced by GENERATE_IDCT_CONSTANTS)
; Out:   q8-q15 = output[0]..output[7]
; Clobb: q0-q7 (all used as scratch). Core registers are only read.
; NOTE(review): q4-q7 (d8-d15) are callee-saved under the AAPCS; the
; enclosing function is presumably responsible for saving them - confirm at
; the expansion site.
    MACRO
    IDCT8x8_1D
    ; stage 1
    vdup.16         d0, r3                    ; d0 = cospi_28_64
    vdup.16         d1, r4                    ; d1 = cospi_4_64
    vdup.16         d2, r5                    ; d2 = cospi_12_64
    vdup.16         d3, r6                    ; d3 = cospi_20_64

    ; q2:q3 = input[1] * cospi_28_64
    vmull.s16       q2, d18, d0
    vmull.s16       q3, d19, d0

    ; q5:q6 = input[5] * cospi_12_64
    vmull.s16       q5, d26, d2
    vmull.s16       q6, d27, d2

    ; q2:q3 = input[1] * cospi_28_64 - input[7] * cospi_4_64
    vmlsl.s16       q2, d30, d1
    vmlsl.s16       q3, d31, d1

    ; q5:q6 = input[5] * cospi_12_64 - input[3] * cospi_20_64
    vmlsl.s16       q5, d22, d3
    vmlsl.s16       q6, d23, d3

    ; step1[4] (q4) = dct_const_round_shift(input[1] * cospi_28_64 -
    ;                                       input[7] * cospi_4_64)
    vqrshrn.s32     d8, q2, #14               ; >> 14
    vqrshrn.s32     d9, q3, #14               ; >> 14

    ; step1[5] (q5) = dct_const_round_shift(input[5] * cospi_12_64 -
    ;                                       input[3] * cospi_20_64)
    vqrshrn.s32     d10, q5, #14              ; >> 14
    vqrshrn.s32     d11, q6, #14              ; >> 14

    ; q2:q3 = input[1] * cospi_4_64
    vmull.s16       q2, d18, d1
    vmull.s16       q3, d19, d1

    ; q9:q13 = input[5] * cospi_20_64. q9 and q13 become 32-bit accumulators
    ; here; input[1] and input[5] are not read again after this point.
    vmull.s16       q9, d26, d3
    vmull.s16       q13, d27, d3

    ; q2:q3 = input[1] * cospi_4_64 + input[7] * cospi_28_64
    vmlal.s16       q2, d30, d0
    vmlal.s16       q3, d31, d0

    ; q9:q13 = input[5] * cospi_20_64 + input[3] * cospi_12_64
    vmlal.s16       q9, d22, d2
    vmlal.s16       q13, d23, d2

    ; step1[7] (q7) = dct_const_round_shift(input[1] * cospi_4_64 +
    ;                                       input[7] * cospi_28_64)
    vqrshrn.s32     d14, q2, #14              ; >> 14
    vqrshrn.s32     d15, q3, #14              ; >> 14

    ; stage 2 & stage 3 - even half
    vdup.16         d0, r7                    ; d0 = cospi_16_64

    ; step1[6] (q6) = dct_const_round_shift(input[5] * cospi_20_64 +
    ;                                       input[3] * cospi_12_64)
    vqrshrn.s32     d12, q9, #14              ; >> 14
    vqrshrn.s32     d13, q13, #14             ; >> 14

    ; q2:q3 = input[0] * cospi_16_64
    vmull.s16       q2, d16, d0
    vmull.s16       q3, d17, d0

    ; q13:q15 = input[0] * cospi_16_64 (input[7] is not read again)
    vmull.s16       q13, d16, d0
    vmull.s16       q15, d17, d0

    ; q2:q3 = (input[0] + input[4]) * cospi_16_64
    vmlal.s16       q2,  d24, d0
    vmlal.s16       q3, d25, d0

    ; q13:q15 = (input[0] - input[4]) * cospi_16_64
    vmlsl.s16       q13, d24, d0
    vmlsl.s16       q15, d25, d0

    vdup.16         d0, r8                    ; d0 = cospi_24_64
    vdup.16         d1, r9                    ; d1 = cospi_8_64

    ; step[0] (q9) = dct_const_round_shift((input[0] + input[4]) *
    ;                                      cospi_16_64)
    vqrshrn.s32     d18, q2, #14              ; >> 14
    vqrshrn.s32     d19, q3, #14              ; >> 14

    ; step[1] (q11) = dct_const_round_shift((input[0] - input[4]) *
    ;                                       cospi_16_64)
    vqrshrn.s32     d22, q13, #14             ; >> 14
    vqrshrn.s32     d23, q15, #14             ; >> 14

    ; q2:q3 = input[2] * cospi_24_64
    vmull.s16       q2, d20, d0
    vmull.s16       q3, d21, d0

    ; q8:q12 = input[2] * cospi_8_64 (input[0] and input[4] are dead now)
    vmull.s16       q8, d20, d1
    vmull.s16       q12, d21, d1

    ; q2:q3 = input[2] * cospi_24_64 - input[6] * cospi_8_64
    vmlsl.s16       q2, d28, d1
    vmlsl.s16       q3, d29, d1

    ; q8:q12 = input[2] * cospi_8_64 + input[6] * cospi_24_64
    vmlal.s16       q8, d28, d0
    vmlal.s16       q12, d29, d0

    ; step[2] (q13) = dct_const_round_shift(input[2] * cospi_24_64 -
    ;                                       input[6] * cospi_8_64)
    vqrshrn.s32     d26, q2, #14              ; >> 14
    vqrshrn.s32     d27, q3, #14              ; >> 14

    ; step[3] (q15) = dct_const_round_shift(input[2] * cospi_8_64 +
    ;                                       input[6] * cospi_24_64)
    vqrshrn.s32     d30, q8, #14              ; >> 14
    vqrshrn.s32     d31, q12, #14             ; >> 14

    ; Even-half butterflies; results become step1[0..3] in q0-q3.
    vadd.s16        q0, q9, q15               ; output[0] = step[0] + step[3]
    vadd.s16        q1, q11, q13              ; output[1] = step[1] + step[2]
    vsub.s16        q2, q11, q13              ; output[2] = step[1] - step[2]
    vsub.s16        q3, q9, q15               ; output[3] = step[0] - step[3]

    ; stage 3 - odd half
    vdup.16         d16, r7                   ; d16 = cospi_16_64

    ; stage 2 - odd half
    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]

    ; q9:q10 = step2[6] * cospi_16_64
    vmull.s16       q9, d28, d16
    vmull.s16       q10, d29, d16

    ; q11:q12 = step2[6] * cospi_16_64
    vmull.s16       q11, d28, d16
    vmull.s16       q12, d29, d16

    ; q9:q10 = (step2[6] - step2[5]) * cospi_16_64
    vmlsl.s16       q9, d26, d16
    vmlsl.s16       q10, d27, d16

    ; q11:q12 = (step2[5] + step2[6]) * cospi_16_64
    vmlal.s16       q11, d26, d16
    vmlal.s16       q12, d27, d16

    ; step1[5] (q5) = dct_const_round_shift((step2[6] - step2[5]) *
    ;                                       cospi_16_64)
    vqrshrn.s32     d10, q9, #14              ; >> 14
    vqrshrn.s32     d11, q10, #14             ; >> 14

    ; step1[6] (q6) = dct_const_round_shift((step2[5] + step2[6]) *
    ;                                       cospi_16_64)
    vqrshrn.s32     d12, q11, #14             ; >> 14
    vqrshrn.s32     d13, q12, #14             ; >> 14

    ; stage 4: step1[0..7] are in q0-q7; final butterflies into q8-q15.
    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
    MEND

    ; Parallel 1D IADST on all the columns of an 8x8 16-bit data matrix which
    ; is loaded in q8-q15. IADST constants are loaded in r0 - r12 registers.
    ; The output will be stored back into q8-q15 registers. This macro will
    ; touch q0 - q7 registers and use them as buffer during calculation.
285f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    MACRO
286f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IADST8X8_1D
287f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vdup.16         d14, r0                   ; duplicate cospi_2_64
288f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vdup.16         d15, r1                   ; duplicate cospi_30_64
289f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
290f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_2_64  * x0
291f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q1, d30, d14
292f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q2, d31, d14
293f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
294f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_30_64 * x0
295f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q3, d30, d15
296f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q4, d31, d15
297f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
2989c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vdup.16         d30, r4                   ; duplicate cospi_18_64
2999c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vdup.16         d31, r5                   ; duplicate cospi_14_64
3009c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
3019c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
3029c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q1, d16, d15
3039c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q2, d17, d15
3049c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
305f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
306f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q3, d16, d14
307f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q4, d17, d14
308f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
309f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_18_64 * x4
310f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q5, d22, d30
311f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q6, d23, d30
312f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
313f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_14_64 * x4
314f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q7, d22, d31
315f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q8, d23, d31
316f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
3179c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
3189c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q5, d24, d31
3199c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q6, d25, d31
3209c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
321f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
322f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q7, d24, d30
323f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q8, d25, d30
324f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
325f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s0 + s4)
326f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q11, q1, q5
327f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q12, q2, q6
328f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
3299c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vdup.16         d0, r2                   ; duplicate cospi_10_64
3309c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vdup.16         d1, r3                   ; duplicate cospi_22_64
331f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
332f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s0 - s4)
333f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q1, q1, q5
334f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q2, q2, q6
335f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
3369c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x0 = dct_const_round_shift(s0 + s4);
3379c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d22, q11, #14             ; >> 14
3389c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d23, q12, #14             ; >> 14
339f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
340f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s1 + s5)
341f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q12, q3, q7
342f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q15, q4, q8
343f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
344f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s1 - s5)
345f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q3, q3, q7
346f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q4, q4, q8
347f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
3489c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x4 = dct_const_round_shift(s0 - s4);
3499c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d2, q1, #14               ; >> 14
3509c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d3, q2, #14               ; >> 14
3519c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
3529c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x1 = dct_const_round_shift(s1 + s5);
3539c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d24, q12, #14             ; >> 14
3549c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d25, q15, #14             ; >> 14
3559c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
356f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; x5 = dct_const_round_shift(s1 - s5);
357f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d6, q3, #14               ; >> 14
358f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d7, q4, #14               ; >> 14
359f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
360f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_10_64 * x2
3619c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmull.s16       q4, d26, d0
3629c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmull.s16       q5, d27, d0
363f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
364f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_22_64 * x2
3659c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmull.s16       q2, d26, d1
3669c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmull.s16       q6, d27, d1
367f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
368f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vdup.16         d30, r6                   ; duplicate cospi_26_64
369f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vdup.16         d31, r7                   ; duplicate cospi_6_64
370f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
3719c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
3729c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q4, d20, d1
3739c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q5, d21, d1
3749c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
3759c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
3769c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlsl.s16       q2, d20, d0
3779c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlsl.s16       q6, d21, d0
3789c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
379f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_26_64 * x6
380f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q0, d18, d30
381f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q13, d19, d30
382f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
383f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
384f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlal.s16       q0, d28, d31
385f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlal.s16       q13, d29, d31
386f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
387f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_6_64  * x6
388f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q10, d18, d31
389f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q9, d19, d31
390f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
391f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
392f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q10, d28, d30
393f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q9, d29, d30
394f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
395f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s3 + s7)
396f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q14, q2, q10
397f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q15, q6, q9
398f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
399f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s3 - s7)
400f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q2, q2, q10
401f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q6, q6, q9
402f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
4039c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x3 = dct_const_round_shift(s3 + s7);
4049c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d28, q14, #14             ; >> 14
4059c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d29, q15, #14             ; >> 14
4069c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
407f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; x7 = dct_const_round_shift(s3 - s7);
408f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d4, q2, #14               ; >> 14
409f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d5, q6, #14               ; >> 14
410f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
411f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s2 + s6)
412f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q9, q4, q0
413f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q10, q5, q13
414f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
415f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s2 - s6)
416f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q4, q4, q0
417f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q5, q5, q13
418f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
4199c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vdup.16         d30, r8                   ; duplicate cospi_8_64
4209c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vdup.16         d31, r9                   ; duplicate cospi_24_64
4219c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
4229c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x2 = dct_const_round_shift(s2 + s6);
4239c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d18, q9, #14              ; >> 14
4249c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d19, q10, #14             ; >> 14
4259c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
426f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; x6 = dct_const_round_shift(s2 - s6);
427f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d8, q4, #14               ; >> 14
428f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d9, q5, #14               ; >> 14
429f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
430f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_8_64  * x4
431f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q5, d2, d30
432f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q6, d3, d30
433f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
434f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_24_64 * x4
435f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q7, d2, d31
436f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q0, d3, d31
437f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
4389c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
4399c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q5, d6, d31
4409c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q6, d7, d31
4419c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
442f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
443f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q7, d6, d30
444f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q0, d7, d30
445f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
446f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_8_64  * x7
447f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q1, d4, d30
448f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q3, d5, d30
449f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
450f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_24_64 * x7
451f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q10, d4, d31
452f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q2, d5, d31
453f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
4549c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
4559c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlsl.s16       q1, d8, d31
4569c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlsl.s16       q3, d9, d31
4579c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
458f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
459f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlal.s16       q10, d8, d30
460f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlal.s16       q2, d9, d30
461f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
462f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
463f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
464f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
465f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
466f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
467f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
468f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
469f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
470f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s4 + s6)
471f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q14, q5, q1
472f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q15, q6, q3
473f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
474f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s4 - s6)
475f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q5, q5, q1
476f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q6, q6, q3
477f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
4789c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x4 = dct_const_round_shift(s4 + s6);
4799c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d18, q14, #14             ; >> 14
4809c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d19, q15, #14             ; >> 14
4819c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
482f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; x6 = dct_const_round_shift(s4 - s6);
483f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d10, q5, #14              ; >> 14
484f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d11, q6, #14              ; >> 14
485f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
486f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s5 + s7)
487f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q1, q7, q10
488f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vadd.s32        q3, q0, q2
489f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
490f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; (s5 - s7)
491f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q7, q7, q10
492f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s32        q0, q0, q2
493f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
4949c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x5 = dct_const_round_shift(s5 + s7);
4959c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d28, q1, #14               ; >> 14
4969c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d29, q3, #14               ; >> 14
4979c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
498f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; x7 = dct_const_round_shift(s5 - s7);
499f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d14, q7, #14              ; >> 14
500f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d15, q0, #14              ; >> 14
501f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
502f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vdup.16         d30, r12                  ; duplicate cospi_16_64
503f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
504f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_16_64 * x2
505f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q2, d22, d30
506f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q3, d23, d30
507f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
508f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_16_64 * x2 (second copy, used for the difference below)
509f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q13, d22, d30
510f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q1, d23, d30
511f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
5129c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; cospi_16_64 * x2 + cospi_16_64  * x3;
5139c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q2, d24, d30
5149c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlal.s16       q3, d25, d30
5159c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
516f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_16_64 * x2 - cospi_16_64  * x3;
517f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q13, d24, d30
518f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlsl.s16       q1, d25, d30
519f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
5209c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; x2 = dct_const_round_shift(s2);
5219c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d4, q2, #14               ; >> 14
5229c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d5, q3, #14               ; >> 14
5239c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
524f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ;x3 = dct_const_round_shift(s3);
525f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d24, q13, #14             ; >> 14
526f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d25, q1, #14              ; >> 14
527f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
528f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_16_64 * x6
529f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q13, d10, d30
530f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmull.s16       q1, d11, d30
531f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
5329c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; cospi_16_64 * x6 (second copy, used for the difference below)
5339c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmull.s16       q11, d10, d30
5349c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmull.s16       q0, d11, d30
5359c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
536f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; cospi_16_64 * x6 + cospi_16_64  * x7;
537f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlal.s16       q13, d14, d30
538f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vmlal.s16       q1, d15, d30
539f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
5409c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    ; cospi_16_64 * x6 - cospi_16_64  * x7;
5419c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlsl.s16       q11, d14, d30
5429c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vmlsl.s16       q0, d15, d30
5439c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org
544f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; x6 = dct_const_round_shift(s6);
545f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d20, q13, #14             ; >> 14
546f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqrshrn.s32     d21, q1, #14              ; >> 14
547f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
548f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ;x7 = dct_const_round_shift(s7);
5499c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d12, q11, #14             ; >> 14
5509c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vqrshrn.s32     d13, q0, #14              ; >> 14
551f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
552f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vdup.16         q5, r10                   ; duplicate 0
553f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
554f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s16        q9, q5, q9                ; output[1] = -x4;
555f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s16        q11, q5, q2               ; output[3] = -x2;
5569c920af5cd2f78ab30bb06f01f4a4d9d30d5c92bfgalligan@chromium.org    vsub.s16        q13, q5, q6               ; output[5] = -x7;
557f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vsub.s16        q15, q5, q4               ; output[7] = -x1;
558f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    MEND
559f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
560f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
561f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    AREA     Block, CODE, READONLY ; name this block of code
562ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
563f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org;                               int dest_stride, int tx_type)
564f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org;
565f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org; r0  int16_t *input
566f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org; r1  uint8_t *dest
567f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org; r2  int dest_stride
568f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org; r3  int tx_type
569f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org; This function will only handle tx_type of 1,2,3.
;
; Flow: load the 8x8 block of 16-bit coefficients, transpose it, run two 1-D
; passes (IDCT and/or IADST, selected by tx_type) with a transpose in between,
; round the result by 5 bits, add it to the 8x8 block of destination pixels
; and store the saturated sum back to dest.
;
; tx_type dispatch: 2 -> idct_iadst, 3 -> iadst_iadst, every other value
; falls through to iadst_idct (so tx_type 0 is not given a DCT/DCT transform).
;
; On return r0 and r1 have been advanced past the destination rows, r12 holds
; a transform constant, and q0-q3 / q8-q15 have been overwritten.
570ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org|vp9_iht8x8_64_add_neon| PROC
571f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
572f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; load the 64 input coefficients into q8-q15 (d16-d31), 16 per load
573f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.s16        {q8,q9}, [r0]!
574f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.s16        {q10,q11}, [r0]!
575f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.s16        {q12,q13}, [r0]!
576f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.s16        {q14,q15}, [r0]!
577f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
    ; The constant generators overwrite the core registers (the IADST one is
    ; documented as using r0 - r12), so dest (r1) and dest_stride (r2) have to
    ; survive on the stack together with the callee-saved r4-r10.
    ; r0 is saved after the post-incremented loads, i.e. it already points
    ; past the input block; it is never used as the input pointer again.
578f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    push            {r0-r10}
    ; d8-d15 are callee-saved under the AAPCS and are written by IADST8X8_1D.
579411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    vpush           {d8-d15}
580f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
581f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; transpose the input data
582f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    TRANSPOSE8X8
583f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
584f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; decide the type of transform
    ; tx_type has to be tested here: GENERATE_IADST_CONSTANTS reuses r3 for a
    ; cosine constant. Anything other than 2 or 3 falls through to iadst_idct.
585f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    cmp         r3, #2
586f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    beq         idct_iadst
587f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    cmp         r3, #3
588f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    beq         iadst_iadst
589f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
    ; Fall-through path: IDCT on the first pass, IADST on the second pass.
590f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.orgiadst_idct
591f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; generate IDCT constants
592f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    GENERATE_IDCT_CONSTANTS
593f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
594f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; first transform rows
595f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IDCT8x8_1D
596f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
597f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; transpose the matrix
598f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    TRANSPOSE8X8
599f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
600f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; generate IADST constants
601f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    GENERATE_IADST_CONSTANTS
602f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
603f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; then transform columns
604f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IADST8X8_1D
605f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
606ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org    b end_vp9_iht8x8_64_add_neon
607f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
    ; tx_type == 2: IADST on the first pass, IDCT on the second pass.
608f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.orgidct_iadst
609f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; generate IADST constants
610f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    GENERATE_IADST_CONSTANTS
611f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
612f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; first transform rows
613f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IADST8X8_1D
614f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
615f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; transpose the matrix
616f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    TRANSPOSE8X8
617f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
618f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; generate IDCT constants
619f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    GENERATE_IDCT_CONSTANTS
620f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
621f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; then transform columns
622f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IDCT8x8_1D
623f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
624ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org    b end_vp9_iht8x8_64_add_neon
625f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
    ; tx_type == 3: IADST on both passes. The constants are generated once and
    ; reused for the second pass.
    ; NOTE(review): this relies on TRANSPOSE8X8 leaving r0-r12 untouched; that
    ; macro is defined outside this section - confirm.
626f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.orgiadst_iadst
627f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; generate IADST constants
628f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    GENERATE_IADST_CONSTANTS
629f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
630f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; first transform rows
631f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IADST8X8_1D
632f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
633f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; transpose the matrix
634f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    TRANSPOSE8X8
635f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
636f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; then transform columns
637f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    IADST8X8_1D
638f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
    ; Common exit: the transformed block is in q8-q15. Restore the saved
    ; registers in the reverse order of the pushes; this brings back
    ; r1 = dest and r2 = dest_stride for the reconstruction below.
639ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgend_vp9_iht8x8_64_add_neon
640411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org    vpop           {d8-d15}
641f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    pop            {r0-r10}
642f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
643f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; ROUND_POWER_OF_TWO(temp_out[j], 5): rounding shift right of every lane
644f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q8, q8, #5
645f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q9, q9, #5
646f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q10, q10, #5
647f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q11, q11, #5
648f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q12, q12, #5
649f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q13, q13, #5
650f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q14, q14, #5
651f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vrshr.s16       q15, q15, #5
652f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
653f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; save dest pointer: r1 walks the rows while loading, r0 while storing
654f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    mov             r0, r1
655f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
656f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; load destination data: eight rows of 8 bytes, dest_stride bytes apart
657f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d0}, [r1], r2
658f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d1}, [r1], r2
659f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d2}, [r1], r2
660f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d3}, [r1], r2
661f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d4}, [r1], r2
662f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d5}, [r1], r2
663f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d6}, [r1], r2
664f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vld1.64         {d7}, [r1]
665f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
666f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
    ; (each unsigned 8-bit pixel is widened to 16 bits before the add)
667f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q8, q8, d0
668f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q9, q9, d1
669f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q10, q10, d2
670f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q11, q11, d3
671f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q12, q12, d4
672f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q13, q13, d5
673f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q14, q14, d6
674f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vaddw.u8        q15, q15, d7
675f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
676f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; clip_pixel: saturating narrow from signed 16-bit to unsigned 8-bit
677f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d0, q8
678f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d1, q9
679f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d2, q10
680f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d3, q11
681f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d4, q12
682f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d5, q13
683f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d6, q14
684f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vqmovun.s16     d7, q15
685f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
686f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ; store the data back to the same eight rows, starting from the saved r0
    ; (the post-increment on the last store is not used afterwards)
687f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d0}, [r0], r2
688f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d1}, [r0], r2
689f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d2}, [r0], r2
690f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d3}, [r0], r2
691f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d4}, [r0], r2
692f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d5}, [r0], r2
693f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d6}, [r0], r2
694f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    vst1.64         {d7}, [r0], r2
695f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    bx          lr
696ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org    ENDP  ; |vp9_iht8x8_64_add_neon|
697f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
698f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    END
699