;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10f71323e297a928af368937089d3ed71239786f86Andreas Huber
11f71323e297a928af368937089d3ed71239786f86Andreas Huber
12f71323e297a928af368937089d3ed71239786f86Andreas Huber%include "vpx_ports/x86_abi_support.asm"
13f71323e297a928af368937089d3ed71239786f86Andreas Huber
;-----------------------------------------------------------------------------
;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; DC-only dequantise + reconstruct for two blocks in a single call.
;
; One 16-bit coefficient is read from qcoeff[0] and one from byte offset 32
; (qcoeff[16]).  Both are multiplied by dequant[0], the `fours` constant is
; added and the sum is arithmetic-shifted right by 3.  The first result is
; broadcast to word lanes 0-3 and the second to word lanes 4-7, so every
; 8-pixel row gets the first value on its left four pixels and the second
; value on its right four pixels.  That vector is added to four 8-byte rows
; of `pre` (blk_stride bytes apart), saturated to unsigned bytes and stored
; as four 8-byte rows at `dst` (dst_stride bytes apart).
;
; Side effect: the four bytes at qcoeff and the four bytes at qcoeff+32 are
; zeroed.
;
; Scratch registers: rax, rcx, rdx, xmm0-xmm5.  The routine deliberately
; stays out of xmm6/xmm7 (callee-saved under the Microsoft x64 convention),
; so no XMM save/restore is required.
; `fours` is defined elsewhere in this file (presumably 4 in every word
; lane, i.e. the rounding term for the >>3 -- confirm against its
; definition).
;-----------------------------------------------------------------------------

global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

        mov         rdx,            arg(1) ; dequant
        mov         rax,            arg(0) ; qcoeff

    ; Gather both DC terms into one register:
    ;   word lane 0 <- first block  (qcoeff[0]  * dequant[0])
    ;   word lane 4 <- second block (qcoeff[16] * dequant[0])
    ; movd pulls in 32 bits, but only word lane 0 survives the broadcast below.
        movd        xmm4,           [rax]
        movd        xmm5,           [rdx]

        pinsrw      xmm4,           [rax+32],   4
        pinsrw      xmm5,           [rdx],      4

        pmullw      xmm4,           xmm5

    ; xmm5 (dequant factors) is dead from here on; recycle it as the all-zero
    ; register used for clearing, unpacking and packing.
        pxor        xmm5,           xmm5

    ; clear coeffs
        movd        [rax],          xmm5
        movd        [rax+32],       xmm5

    ; broadcast lane 0 across lanes 0-3 and lane 4 across lanes 4-7
        pshuflw     xmm4,           xmm4,       00000000b
        pshufhw     xmm4,           xmm4,       00000000b

        mov         rax,            arg(2) ; pre
        paddw       xmm4,           [GLOBAL(fours)]

        movsxd      rcx,            dword ptr arg(5) ; blk_stride
        psraw       xmm4,           3

    ; Load four 8-byte prediction rows
        movq        xmm0,           [rax]
        movq        xmm1,           [rax+rcx]
        movq        xmm2,           [rax+2*rcx]
        lea         rcx,            [3*rcx]         ; rcx = 3 * blk_stride
        movq        xmm3,           [rax+rcx]

    ; widen bytes -> words
        punpcklbw   xmm0,           xmm5
        punpcklbw   xmm1,           xmm5
        punpcklbw   xmm2,           xmm5
        punpcklbw   xmm3,           xmm5

        mov         rax,            arg(3) ; dst
        movsxd      rdx,            dword ptr arg(4) ; dst_stride

    ; Add to predict buffer
        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm4
        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm4

    ; pack up before storing (unsigned saturation; only the low 8 bytes
    ; of each register are written out below)
        packuswb    xmm0,           xmm5
        packuswb    xmm1,           xmm5
        packuswb    xmm2,           xmm5
        packuswb    xmm3,           xmm5

    ; store blocks back out
        movq        [rax],          xmm0
        movq        [rax + rdx],    xmm1

        lea         rax,            [rax + 2*rdx]

        movq        [rax],          xmm2
        movq        [rax + rdx],    xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
99f71323e297a928af368937089d3ed71239786f86Andreas Huber
;-----------------------------------------------------------------------------
; idct_dequant_full_2x_sse2
;
; Arguments as read through arg(n) below:
;   arg(0)  qcoeff      - 64 bytes of 16-bit coefficients, read then zeroed
;   arg(1)  dequant     - 32 bytes of 16-bit factors, applied to both halves
;   arg(2)  pre         - prediction pixels, 4 rows of 8 bytes
;   arg(3)  dst         - output pixels, 4 rows of 8 bytes
;   arg(4)  dst_stride  - 32-bit signed, bytes between dst rows
;   arg(5)  blk_stride  - 32-bit signed, bytes between pre rows
;
; Dequantises two blocks of 16 coefficients each (bytes 0-31 and 32-63 of
; qcoeff), runs the same two-pass butterfly transform on both in parallel
; (one block in the low four word lanes, the other in the high four), adds
; `fours` and shifts right by 3, adds the result to the prediction rows,
; saturates to unsigned bytes and stores four 8-byte rows to dst.
;
; qcoeff and dequant are accessed with movdqa / SSE memory operands and must
; therefore be 16-byte aligned, as must the constants reached via GLOBAL().
; `fours`, `x_s1sqr2` and `x_c1sqr2less1` are defined elsewhere in this file.
;
; Overwrites rax, rcx, rdx and xmm0-xmm7; rsi/rdi are saved and restored.
; NOTE(review): xmm6 and xmm7 are overwritten without being saved.  They are
;   callee-saved under the Microsoft x64 convention -- confirm whether this
;   file is built for Win64 and, if so, add a save/restore.
; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although only arg(0)..arg(5)
;   are read here -- confirm the intended argument count.
;-----------------------------------------------------------------------------
global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; fetch the pointer arguments and the prediction stride
        mov         rax,            arg(0) ; qcoeff
        mov         rsi,            arg(2) ; pre
        mov         rdi,            arg(3) ; dst
        movsxd      rcx,            dword ptr arg(5) ; blk_stride

    ; Zero out xmm7, used here to clear the coefficient buffer
        pxor        xmm7,           xmm7

        mov         rdx,            arg(1)  ; dequant

    ; Note that xmm1 and xmm2 are deliberately swapped relative to memory
    ;   order: xmm0/xmm2 hold bytes 0-31 (first block), xmm1/xmm3 hold
    ;   bytes 32-63 (second block).  The repack below needs this pairing
    ;   to produce sensible data.
        movdqa      xmm0,           [rax]
        movdqa      xmm2,           [rax+16]
        movdqa      xmm1,           [rax+32]
        movdqa      xmm3,           [rax+48]

    ; Clear out coeffs
        movdqa      [rax],          xmm7
        movdqa      [rax+16],       xmm7
        movdqa      [rax+32],       xmm7
        movdqa      [rax+48],       xmm7

    ; dequantize qcoeff buffer (same 16 factors for both blocks)
        pmullw      xmm0,           [rdx]
        pmullw      xmm2,           [rdx+16]
        pmullw      xmm1,           [rdx]
        pmullw      xmm3,           [rdx+16]

    ; repack so block 0 row x and block 1 row x are together
    ; (afterwards xmm0..xmm3 = rows 0..3, block 0 in the low 64 bits and
    ;  block 1 in the high 64 bits)
        movdqa      xmm4,           xmm0
        punpckldq   xmm0,           xmm1
        punpckhdq   xmm4,           xmm1

        pshufd      xmm0,           xmm0,       11011000b
        pshufd      xmm1,           xmm4,       11011000b

        movdqa      xmm4,           xmm2
        punpckldq   xmm2,           xmm3
        punpckhdq   xmm4,           xmm3

        pshufd      xmm2,           xmm2,       11011000b
        pshufd      xmm3,           xmm4,       11011000b

    ; first pass
        psubw       xmm0,           xmm2        ; b1 = 0-2
        paddw       xmm2,           xmm2        ;

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0        ; a1 = 0+2

    ; each "pmulhw by constant, then paddw the input back" pair below is
    ; the original author's fixed-point multiply; see their per-line notes
        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5        ; c1

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4

        paddw       xmm3,           xmm5        ; d1
        movdqa      xmm6,           xmm2        ; a1

        movdqa      xmm4,           xmm0        ; b1
        paddw       xmm2,           xmm3        ;0

        paddw       xmm4,           xmm7        ;1
        psubw       xmm0,           xmm7        ;2

        psubw       xmm6,           xmm3        ;3

    ; transpose for the second pass
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

    ; swap the two middle dwords so each block's four words are contiguous
        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; second pass (same butterflies as the first pass, plus rounding)
        psubw       xmm0,           xmm2            ; b1 = 0-2
        paddw       xmm2,           xmm2

        movdqa      xmm5,           xmm1
        paddw       xmm2,           xmm0            ; a1 = 0+2

        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

        movdqa      xmm7,           xmm3
        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
        psubw       xmm7,           xmm5            ; c1

        movdqa      xmm5,           xmm1
        movdqa      xmm4,           xmm3

        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
        paddw       xmm5,           xmm1

        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
        paddw       xmm3,           xmm4

        paddw       xmm3,           xmm5            ; d1

    ; `fours` is folded into a1 and b1 before the final butterflies, so all
    ; four outputs carry it into the >>3 below
        paddw       xmm0,           [GLOBAL(fours)]

        paddw       xmm2,           [GLOBAL(fours)]
        movdqa      xmm6,           xmm2            ; a1

        movdqa      xmm4,           xmm0            ; b1
        paddw       xmm2,           xmm3            ;0

        paddw       xmm4,           xmm7            ;1
        psubw       xmm0,           xmm7            ;2

        psubw       xmm6,           xmm3            ;3
        psraw       xmm2,           3

        psraw       xmm0,           3
        psraw       xmm4,           3

        psraw       xmm6,           3

    ; transpose to save
        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108


        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102


        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

        pshufd      xmm0,           xmm2,       11011000b
        pshufd      xmm2,           xmm1,       11011000b

        pshufd      xmm1,           xmm5,       11011000b
        pshufd      xmm3,           xmm7,       11011000b

    ; xmm7 was used as scratch above; re-zero it for the unpack/pack below
        pxor        xmm7,           xmm7

    ; Load up predict blocks, widen to words and add the residual rows
        movq        xmm4,           [rsi]
        movq        xmm5,           [rsi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm0,           xmm4
        paddw       xmm1,           xmm5

        movq        xmm4,           [rsi+2*rcx]
        lea         rcx,            [3*rcx]         ; rcx = 3 * blk_stride
        movq        xmm5,           [rsi+rcx]

        punpcklbw   xmm4,           xmm7
        punpcklbw   xmm5,           xmm7

        paddw       xmm2,           xmm4
        paddw       xmm3,           xmm5

.finish:

    ; pack up before storing (unsigned saturation; only the low 8 bytes
    ; of each register are written out below)
        packuswb    xmm0,           xmm7
        packuswb    xmm1,           xmm7
        packuswb    xmm2,           xmm7
        packuswb    xmm3,           xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
        movsxd      rdx,            dword ptr arg(4) ; dst_stride

    ; store blocks back out
        movq        [rdi],          xmm0
        movq        [rdi + rdx],    xmm1

        lea         rdi,            [rdi + 2*rdx]

        movq        [rdi],          xmm2
        movq        [rdi + rdx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
353f71323e297a928af368937089d3ed71239786f86Andreas Huber
354f71323e297a928af368937089d3ed71239786f86Andreas Huber;void idct_dequant_dc_0_2x_sse2
355f71323e297a928af368937089d3ed71239786f86Andreas Huber; (
356f71323e297a928af368937089d3ed71239786f86Andreas Huber;   short *qcoeff       - 0  (not read: the DC terms come from dc)
357f71323e297a928af368937089d3ed71239786f86Andreas Huber;   short *dequant      - 1  (not read)
358f71323e297a928af368937089d3ed71239786f86Andreas Huber;   unsigned char *pre  - 2  (prediction; 8 bytes read at offsets 0/16/32/48)
359f71323e297a928af368937089d3ed71239786f86Andreas Huber;   unsigned char *dst  - 3  (output; 4 rows of 8 bytes)
360f71323e297a928af368937089d3ed71239786f86Andreas Huber;   int dst_stride      - 4  (byte distance between dst rows)
361f71323e297a928af368937089d3ed71239786f86Andreas Huber;   short *dc           - 5  (two words: dc[0] for block 0, dc[1] for block 1)
362f71323e297a928af368937089d3ed71239786f86Andreas Huber; )
;
; DC-only reconstruction for two side-by-side 4-pixel-wide blocks:
;   each output byte = saturate_u8(pre byte + ((dc[n] + 4) >> 3))
; where dc[0] is applied to bytes 0-3 of every row and dc[1] to bytes 4-7.
;
; Register roles:
;   rsi = pre, rdi = dst, rdx = dc pointer, later reused for dst_stride
;   xmm0-xmm3 = the four rows, xmm4 = broadcast DC terms, xmm5 = zero
; Only xmm0-xmm5 are touched, so no XMM register that is callee-saved
; under the Microsoft x64 convention (xmm6-xmm15) is modified.
363f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(idct_dequant_dc_0_2x_sse2)
364f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(idct_dequant_dc_0_2x_sse2):
365f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
366f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
367f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 7
368f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
369f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
370f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
371f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
372f71323e297a928af368937089d3ed71239786f86Andreas Huber
373f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; special case when 2 blocks have 0 or 1 coeffs
374f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; dc is supplied separately, so qcoeff (arg 0) is never loaded
376f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,            arg(2) ; pre
377f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdi,            arg(3) ; dst
378f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,            arg(5) ; dc
379f71323e297a928af368937089d3ed71239786f86Andreas Huber
380f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Zero out xmm5, for use unpacking and packing
381f71323e297a928af368937089d3ed71239786f86Andreas Huber        pxor        xmm5,           xmm5
382f71323e297a928af368937089d3ed71239786f86Andreas Huber
383f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; load both dc words at once: 2 * 16 bits = one doubleword
384f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        xmm4,           [rdx]
385f71323e297a928af368937089d3ed71239786f86Andreas Huber
386f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Load up predict blocks (8 bytes per row, rows 16 bytes apart)
387f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm0,           [rsi]
388f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm1,           [rsi+16]
389f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm2,           [rsi+32]
390f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm3,           [rsi+48]
391f71323e297a928af368937089d3ed71239786f86Andreas Huber
392f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Duplicate and expand dc across: words become dc0 x4 | dc1 x4
393f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm4,           xmm4
394f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm4,           xmm4
395f71323e297a928af368937089d3ed71239786f86Andreas Huber
396f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Rounding and downshift: (dc + 4) >> 3
397538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm4,           [GLOBAL(fours)]
398f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm4,           3
399f71323e297a928af368937089d3ed71239786f86Andreas Huber
400f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Predict buffer needs to be expanded from bytes to words
401f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm0,           xmm5
402f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm1,           xmm5
403f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm2,           xmm5
404f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm3,           xmm5
405f71323e297a928af368937089d3ed71239786f86Andreas Huber
406f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Add to predict buffer
407f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm0,           xmm4
408f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm1,           xmm4
409f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm4
410f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm4
411f71323e297a928af368937089d3ed71239786f86Andreas Huber
412f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; pack up before storing (unsigned saturation; upper 8 bytes are zero)
413f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm0,           xmm5
414f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm1,           xmm5
415f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm2,           xmm5
416f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm3,           xmm5
417f71323e297a928af368937089d3ed71239786f86Andreas Huber
418f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Load destination stride before writing out,
419f71323e297a928af368937089d3ed71239786f86Andreas Huber    ;   doesn't need to persist (rdx no longer holds the dc pointer)
420f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rdx,            dword ptr arg(4) ; dst_stride
421f71323e297a928af368937089d3ed71239786f86Andreas Huber
422f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; store blocks back out
423f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi],          xmm0
424f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi + rdx],    xmm1
425f71323e297a928af368937089d3ed71239786f86Andreas Huber
426f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,            [rdi + 2*rdx]
427f71323e297a928af368937089d3ed71239786f86Andreas Huber
428f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi],          xmm2
429f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi + rdx],    xmm3
430f71323e297a928af368937089d3ed71239786f86Andreas Huber
431f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
432f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rdi
433f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rsi
434f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
435f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
436f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
437f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
438f71323e297a928af368937089d3ed71239786f86Andreas Huber
;void idct_dequant_dc_full_2x_sse2
; (
;   qcoeff      - 0  two consecutive blocks of 16 words (64 bytes), read with
;                    aligned loads and then overwritten with zeros
;   dequant     - 1  16 words, applied to both blocks
;   pre         - 2  prediction; 8 bytes read at offsets 0/16/32/48
;   dst         - 3  output; 4 rows of 8 bytes
;   dst_stride  - 4  read as a signed dword, byte distance between dst rows
;   dc          - 5  two words that replace coefficient 0 of block 0 / block 1
; )
; NOTE(review): the argument names and meanings above are inferred from how
;   arg(0)..arg(5) are used below and from the sibling
;   idct_dequant_dc_0_2x_sse2 header - confirm against the C prototype.
;
; Dequantizes two 4x4 blocks, substitutes the supplied DC terms, runs the
; two-pass inverse transform on both blocks at once (block 0 in the low four
; words of each register, block 1 in the high four), adds the prediction and
; stores unsigned-saturated bytes to dst.
;
; NOTE(review): xmm6 and xmm7 are modified below and no save/restore is
;   visible in this function; they are callee-saved under the Microsoft x64
;   convention - confirm whether the prologue macros or the build targets
;   make this safe.
439f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(idct_dequant_dc_full_2x_sse2)
440f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(idct_dequant_dc_full_2x_sse2):
441f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
442f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
443f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 7
444f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
445f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
446f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
447f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
448f71323e297a928af368937089d3ed71239786f86Andreas Huber
449f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; general case: both blocks get the full two-pass inverse transform
450f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; qcoeff is loaded, dequantized and cleared; the DC terms come from arg(5)
451f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rax,            arg(0) ; qcoeff
452f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,            arg(2) ; pre
453f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdi,            arg(3) ; dst
454f71323e297a928af368937089d3ed71239786f86Andreas Huber
455f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Zero out xmm7, used here to clear the coefficient buffer
456f71323e297a928af368937089d3ed71239786f86Andreas Huber        pxor        xmm7,           xmm7
457f71323e297a928af368937089d3ed71239786f86Andreas Huber
458f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,            arg(1)  ; dequant
459f71323e297a928af368937089d3ed71239786f86Andreas Huber
460f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; note xmm1 and xmm2 are loaded swapped (xmm2 = [rax+16], xmm1 = [rax+32]),
461f71323e297a928af368937089d3ed71239786f86Andreas Huber    ;   necessary for the dword repack below to spit out sensible data
462f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,           [rax]
463f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,           [rax+16]
464f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,           [rax+32]
465f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm3,           [rax+48]
466f71323e297a928af368937089d3ed71239786f86Andreas Huber
467f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Clear out coeffs (all 64 bytes of the input buffer become zero)
468f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax],          xmm7
469f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax+16],       xmm7
470f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax+32],       xmm7
471f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rax+48],       xmm7
472f71323e297a928af368937089d3ed71239786f86Andreas Huber
473f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; dequantize qcoeff buffer: the same 16-word table is applied to each block
474f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm0,           [rdx]
475f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm2,           [rdx+16]
476f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm1,           [rdx]
477f71323e297a928af368937089d3ed71239786f86Andreas Huber        pmullw      xmm3,           [rdx+16]
478f71323e297a928af368937089d3ed71239786f86Andreas Huber
479f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; DC component (rdx is reused: pointer to the two dc words)
480f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,            arg(5)
481f71323e297a928af368937089d3ed71239786f86Andreas Huber
482f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; repack so block 0 row x and block 1 row x are together
483f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm0
484f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm0,           xmm1
485f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm4,           xmm1
486f71323e297a928af368937089d3ed71239786f86Andreas Huber
487f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm0,           xmm0,       11011000b  ; swap the two middle dwords
488f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm1,           xmm4,       11011000b
489f71323e297a928af368937089d3ed71239786f86Andreas Huber
490f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm2
491f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm3
492f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm4,           xmm3
493f71323e297a928af368937089d3ed71239786f86Andreas Huber
494f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm2,           xmm2,       11011000b
495f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm3,           xmm4,       11011000b
496f71323e297a928af368937089d3ed71239786f86Andreas Huber
497f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; insert DC component (overwrites coefficient 0 of each block)
498f71323e297a928af368937089d3ed71239786f86Andreas Huber        pinsrw      xmm0,           [rdx],      0  ; word 0 <- first dc word (block 0)
499f71323e297a928af368937089d3ed71239786f86Andreas Huber        pinsrw      xmm0,           [rdx+2],    4  ; word 4 <- second dc word (block 1)
500f71323e297a928af368937089d3ed71239786f86Andreas Huber
501f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; first pass (xmm0..xmm3 = rows 0..3 of both blocks)
502f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm2        ; b1 = 0-2
503f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm2        ; 2 * row 2
504f71323e297a928af368937089d3ed71239786f86Andreas Huber
505f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
506f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm0        ; a1 = 0+2
507f71323e297a928af368937089d3ed71239786f86Andreas Huber
508538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
509f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
510f71323e297a928af368937089d3ed71239786f86Andreas Huber
511f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm3
512538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
513f71323e297a928af368937089d3ed71239786f86Andreas Huber
514f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
515f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm7,           xmm5        ; c1 (ip3 term minus ip1 term)
516f71323e297a928af368937089d3ed71239786f86Andreas Huber
517f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
518f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm3
519f71323e297a928af368937089d3ed71239786f86Andreas Huber
520538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
521f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1        ; ip1 * cos(pi/8) * sqrt(2)
522f71323e297a928af368937089d3ed71239786f86Andreas Huber
523538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
524f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm4        ; ip3 * sin(pi/8) * sqrt(2)
525f71323e297a928af368937089d3ed71239786f86Andreas Huber
526f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm5        ; d1
527f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm2        ; a1
528f71323e297a928af368937089d3ed71239786f86Andreas Huber
529f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm0        ; b1
530f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm3        ;0
531f71323e297a928af368937089d3ed71239786f86Andreas Huber
532f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm4,           xmm7        ;1
533f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm7        ;2
534f71323e297a928af368937089d3ed71239786f86Andreas Huber
535f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm6,           xmm3        ;3
536f71323e297a928af368937089d3ed71239786f86Andreas Huber
537f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; transpose for the second pass
538f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
539f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
540f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
541f71323e297a928af368937089d3ed71239786f86Andreas Huber
542f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
543f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
544f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
545f71323e297a928af368937089d3ed71239786f86Andreas Huber
546f71323e297a928af368937089d3ed71239786f86Andreas Huber
547f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
548f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
549f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
550f71323e297a928af368937089d3ed71239786f86Andreas Huber
551f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
552f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
553f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
554f71323e297a928af368937089d3ed71239786f86Andreas Huber
555f71323e297a928af368937089d3ed71239786f86Andreas Huber
556f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
557f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
558f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
559f71323e297a928af368937089d3ed71239786f86Andreas Huber
560f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
561f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
562f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
563f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; swap the two middle dwords of each register so that, per the index
    ;   comments above, block 0 ends up in the low half and block 1 in the high half
564f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm0,           xmm2,       11011000b
565f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm2,           xmm1,       11011000b
566f71323e297a928af368937089d3ed71239786f86Andreas Huber
567f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm1,           xmm5,       11011000b
568f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm3,           xmm7,       11011000b
569f71323e297a928af368937089d3ed71239786f86Andreas Huber
570f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; second pass (same butterfly as the first, plus rounding and >> 3)
571f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm2            ; b1 = 0-2
572f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm2
573f71323e297a928af368937089d3ed71239786f86Andreas Huber
574f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
575f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm0            ; a1 = 0+2
576f71323e297a928af368937089d3ed71239786f86Andreas Huber
577538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
578f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
579f71323e297a928af368937089d3ed71239786f86Andreas Huber
580f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm3
581538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
582f71323e297a928af368937089d3ed71239786f86Andreas Huber
583f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
584f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm7,           xmm5            ; c1
585f71323e297a928af368937089d3ed71239786f86Andreas Huber
586f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm1
587f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm3
588f71323e297a928af368937089d3ed71239786f86Andreas Huber
589538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
590f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm5,           xmm1
591f71323e297a928af368937089d3ed71239786f86Andreas Huber
592538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
593f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm4
594f71323e297a928af368937089d3ed71239786f86Andreas Huber
595f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm5            ; d1
596538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm0,           [GLOBAL(fours)]  ; rounding folded into b1
597f71323e297a928af368937089d3ed71239786f86Andreas Huber
598538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm2,           [GLOBAL(fours)]  ; rounding folded into a1
599f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm2            ; a1
600f71323e297a928af368937089d3ed71239786f86Andreas Huber
601f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,           xmm0            ; b1
602f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm3            ;0
603f71323e297a928af368937089d3ed71239786f86Andreas Huber
604f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm4,           xmm7            ;1
605f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm0,           xmm7            ;2
606f71323e297a928af368937089d3ed71239786f86Andreas Huber
607f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubw       xmm6,           xmm3            ;3
608f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm2,           3
609f71323e297a928af368937089d3ed71239786f86Andreas Huber
610f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm0,           3
611f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm4,           3
612f71323e297a928af368937089d3ed71239786f86Andreas Huber
613f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm6,           3
614f71323e297a928af368937089d3ed71239786f86Andreas Huber
615f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; transpose to save
616f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
617f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
618f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
619f71323e297a928af368937089d3ed71239786f86Andreas Huber
620f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
621f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
622f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
623f71323e297a928af368937089d3ed71239786f86Andreas Huber
624f71323e297a928af368937089d3ed71239786f86Andreas Huber
625f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
626f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
627f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
628f71323e297a928af368937089d3ed71239786f86Andreas Huber
629f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
630f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
631f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
632f71323e297a928af368937089d3ed71239786f86Andreas Huber
633f71323e297a928af368937089d3ed71239786f86Andreas Huber
634f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
635f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
636f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
637f71323e297a928af368937089d3ed71239786f86Andreas Huber
638f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
639f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
640f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
641f71323e297a928af368937089d3ed71239786f86Andreas Huber
642f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm0,           xmm2,       11011000b
643f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm2,           xmm1,       11011000b
644f71323e297a928af368937089d3ed71239786f86Andreas Huber
645f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm1,           xmm5,       11011000b
646f71323e297a928af368937089d3ed71239786f86Andreas Huber        pshufd      xmm3,           xmm7,       11011000b
647f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; xmm7 was used as scratch above; zero it again for the unpack and pack below
648f71323e297a928af368937089d3ed71239786f86Andreas Huber        pxor        xmm7,           xmm7
649f71323e297a928af368937089d3ed71239786f86Andreas Huber
650f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Load up predict blocks, widen bytes to words and add to xmm0..xmm3
651f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm4,           [rsi]
652f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm5,           [rsi+16]
653f71323e297a928af368937089d3ed71239786f86Andreas Huber
654f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm4,           xmm7
655f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm5,           xmm7
656f71323e297a928af368937089d3ed71239786f86Andreas Huber
657f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm0,           xmm4
658f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm1,           xmm5
659f71323e297a928af368937089d3ed71239786f86Andreas Huber
660f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm4,           [rsi+32]
661f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm5,           [rsi+48]
662f71323e297a928af368937089d3ed71239786f86Andreas Huber
663f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm4,           xmm7
664f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm5,           xmm7
665f71323e297a928af368937089d3ed71239786f86Andreas Huber
666f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm2,           xmm4
667f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddw       xmm3,           xmm5
668f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; no branch in this function targets .finish; execution simply falls through
669f71323e297a928af368937089d3ed71239786f86Andreas Huber.finish:
670f71323e297a928af368937089d3ed71239786f86Andreas Huber
671f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; pack up before storing (words -> bytes with unsigned saturation)
672f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm0,           xmm7
673f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm1,           xmm7
674f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm2,           xmm7
675f71323e297a928af368937089d3ed71239786f86Andreas Huber        packuswb    xmm3,           xmm7
676f71323e297a928af368937089d3ed71239786f86Andreas Huber
677f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; Load destination stride before writing out,
678f71323e297a928af368937089d3ed71239786f86Andreas Huber    ;   doesn't need to persist (rdx no longer holds the dc pointer)
679f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rdx,            dword ptr arg(4) ; dst_stride
680f71323e297a928af368937089d3ed71239786f86Andreas Huber
681f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; store blocks back out (8 bytes per row, rows 0..3)
682f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi],          xmm0
683f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi + rdx],    xmm1
684f71323e297a928af368937089d3ed71239786f86Andreas Huber
685f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,            [rdi + 2*rdx]
686f71323e297a928af368937089d3ed71239786f86Andreas Huber
687f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi],          xmm2
688f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        [rdi + rdx],    xmm3
689f71323e297a928af368937089d3ed71239786f86Andreas Huber
690f71323e297a928af368937089d3ed71239786f86Andreas Huber
691f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
692f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rdi
693f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rsi
694f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
695f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
696f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
697f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
698f71323e297a928af368937089d3ed71239786f86Andreas Huber
; Read-only constants. Each label is a 16-byte-aligned vector of eight
; identical words, used directly as the memory operand of paddw/pmulhw in
; the functions above.
699f71323e297a928af368937089d3ed71239786f86Andreas HuberSECTION_RODATA
700f71323e297a928af368937089d3ed71239786f86Andreas Huberalign 16
701f71323e297a928af368937089d3ed71239786f86Andreas Huberfours:
702f71323e297a928af368937089d3ed71239786f86Andreas Huber    times 8 dw 0x0004  ; rounding term added before each psraw by 3
703f71323e297a928af368937089d3ed71239786f86Andreas Huberalign 16
704f71323e297a928af368937089d3ed71239786f86Andreas Huberx_s1sqr2:
705f71323e297a928af368937089d3ed71239786f86Andreas Huber    times 8 dw 0x8A8C  ; 35468 ~= sin(pi/8)*sqrt(2) * 65536; negative as a signed word, so callers add the input back after pmulhw
706f71323e297a928af368937089d3ed71239786f86Andreas Huberalign 16
707f71323e297a928af368937089d3ed71239786f86Andreas Huberx_c1sqr2less1:
708f71323e297a928af368937089d3ed71239786f86Andreas Huber    times 8 dw 0x4E7B  ; 20091 ~= (cos(pi/8)*sqrt(2) - 1) * 65536; callers add the input back after pmulhw
709