1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan%include "vpx_ports/x86_abi_support.asm"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Sums the squares of 256 consecutive 16-bit values starting at src_ptr
; (8 loop iterations, 32 values per iteration) and leaves the total in
; the low 32 bits of rax.
;
; src_ptr is read with movdqa, so it must be 16-byte aligned.
; Uses xmm0-xmm4, rax, rcx.
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        mov         rcx, 8                  ; iterations left (64 bytes each)
        pxor        xmm4, xmm4              ; xmm4 = four dword partial sums

.NEXTROW:
        movdqa      xmm0, [rax]
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]

        ; pmaddwd x, x: square every word and add adjacent pairs -> dwords
        pmaddwd     xmm0, xmm0
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40               ; advance 32 shorts
        dec         rcx
        ; dec leaves CF untouched, so branch on ZF alone rather than using
        ; ja (which would also depend on the carry out of the add above).
        jnz         .NEXTROW

        ; horizontal add of the four dwords in xmm4 into its low dword
        movdqa      xmm3,xmm4
        psrldq      xmm4,8
        paddd       xmm4,xmm3
        movdqa      xmm3,xmm4
        psrldq      xmm4,4
        paddd       xmm4,xmm3
        movq        rax,xmm4                ; total is in the low 32 bits


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan
71233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block of bytes, writes
;   *Sum = sum of (src - ref) over all 256 pixels
;   *SSE = sum of (src - ref)^2 over all 256 pixels
;
; Register roles inside the body:
;   rsi / rdi   current src / ref row
;   rax / rdx   source_stride / recon_stride (sign-extended)
;   rcx         rows remaining (also a scratch 3*stride for the prefetches)
;   rbx         scratch row pointer for the prefetches (saved/restored)
;   xmm0        zero, used for unpacking
;   xmm6        dword accumulators of squared differences
;   xmm7        word accumulators of differences
;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(2)                     ; ref_ptr

        movsxd      rax, DWORD PTR arg(1)           ; source_stride
        movsxd      rdx, DWORD PTR arg(3)           ; recon_stride

        ; Prefetch src rows 0..7
        lea         rcx, [rax+rax*2]                ; 3 * source_stride
        prefetcht0  [rsi]
        prefetcht0  [rsi+rax]
        prefetcht0  [rsi+rax*2]
        prefetcht0  [rsi+rcx]
        lea         rbx, [rsi+rax*4]                ; src row 4
        prefetcht0  [rbx]
        prefetcht0  [rbx+rax]
        prefetcht0  [rbx+rax*2]
        prefetcht0  [rbx+rcx]

        ; Prefetch ref rows 0..7
        lea         rcx, [rdx+rdx*2]                ; 3 * recon_stride
        prefetcht0  [rdi]
        prefetcht0  [rdi+rdx]
        prefetcht0  [rdi+rdx*2]
        prefetcht0  [rdi+rcx]
        lea         rbx, [rdi+rdx*4]                ; ref row 4
        prefetcht0  [rbx]
        prefetcht0  [rbx+rdx]
        prefetcht0  [rbx+rdx*2]
        prefetcht0  [rbx+rcx]

        pxor        xmm0, xmm0                      ; zero for byte -> word unpack
        pxor        xmm7, xmm7                      ; difference accumulator (words)
        pxor        xmm6, xmm6                      ; squared-difference accumulator (dwords)
        mov         rcx, 16                         ; rows to process

.var16_next_row:
        ; fetch the rows eight lines ahead of the current one
        prefetcht0  [rsi+rax*8]
        prefetcht0  [rdi+rdx*8]

        ; src row: bytes -> words, low half in xmm1, high half in xmm3
        movdqu      xmm1, XMMWORD PTR [rsi]
        movdqa      xmm3, xmm1
        punpcklbw   xmm1, xmm0
        punpckhbw   xmm3, xmm0

        ; ref row: bytes -> words, low half in xmm2, high half in xmm4
        movdqu      xmm2, XMMWORD PTR [rdi]
        movdqa      xmm4, xmm2
        punpcklbw   xmm2, xmm0
        punpckhbw   xmm4, xmm0

        ; 16 signed word differences
        psubw       xmm1, xmm2
        psubw       xmm3, xmm4

        ; Each word lane collects 2 differences per row, 32 in total, each
        ; within [-255, 255], so the lanes cannot overflow 16 bits.
        paddw       xmm7, xmm1
        paddw       xmm7, xmm3

        ; square and pair-add into dwords, then accumulate
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm3, xmm3
        paddd       xmm6, xmm1
        paddd       xmm6, xmm3

        add         rsi, rax
        add         rdi, rdx

        sub         rcx, 1
        jnz         .var16_next_row


        movdqa      xmm1, xmm6                      ; xmm1 = squared-difference dwords

        ; ---- Sum: sign-extend the 8 word sums to dwords and add them up ----
        pxor        xmm6, xmm6
        pxor        xmm5, xmm5
        punpcklwd   xmm6, xmm7                      ; words 0..3 into dword high halves
        punpckhwd   xmm5, xmm7                      ; words 4..7 into dword high halves
        psrad       xmm6, 16                        ; arithmetic shift = sign extension
        psrad       xmm5, 16
        paddd       xmm6, xmm5                      ; four dword partial sums

        movdqa      xmm7, xmm6
        punpckldq   xmm6, xmm0                      ; [d0, 0, d1, 0]
        punpckhdq   xmm7, xmm0                      ; [d2, 0, d3, 0]
        paddd       xmm6, xmm7                      ; [d0+d2, 0, d1+d3, 0]

        movdqa      xmm7, xmm6
        psrldq      xmm6, 8
        paddd       xmm7, xmm6                      ; low dword of xmm7 = Sum

        ; ---- SSE: add up the four dword partial sums ----
        movdqa      xmm2, xmm1
        punpckldq   xmm1, xmm0                      ; [s0, 0, s1, 0]
        punpckhdq   xmm2, xmm0                      ; [s2, 0, s3, 0]
        paddd       xmm1, xmm2                      ; [s0+s2, 0, s1+s3, 0]

        movdqa      xmm2, xmm1
        psrldq      xmm1, 8
        paddd       xmm1, xmm2                      ; low dword of xmm1 = SSE

        mov         rax, arg(5)                     ; Sum
        mov         rdi, arg(4)                     ; SSE

        movd DWORD PTR [rax],       xmm7
        movd DWORD PTR [rdi],       xmm1


    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block of bytes, writes
;   *Sum = sum of (src - ref) over all 64 pixels
;   *SSE = sum of (src - ref)^2 over all 64 pixels
;
; Register roles inside the body:
;   rsi / rdi   src / ref row pointer (advanced two rows at a time)
;   rax / rdx   source_stride / recon_stride (sign-extended)
;   xmm0        zero, used for unpacking
;   xmm1        dword accumulators of squared differences
;   xmm7        word accumulators of differences
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi, arg(0)                     ; src_ptr
        mov         rdi, arg(2)                     ; ref_ptr

        movsxd      rax, DWORD PTR arg(1)           ; source_stride
        movsxd      rdx, DWORD PTR arg(3)           ; recon_stride

        pxor        xmm0, xmm0                      ; zero for byte -> word unpack
        pxor        xmm7, xmm7                      ; difference accumulator (words)

        ; ---- row 0 (its squares seed the xmm1 accumulator) ----
        movq        xmm1, QWORD PTR [rsi]
        punpcklbw   xmm1, xmm0
        movq        xmm2, QWORD PTR [rdi]
        punpcklbw   xmm2, xmm0

        psubsw      xmm1, xmm2
        paddw       xmm7, xmm1
        pmaddwd     xmm1, xmm1

        ; ---- row 1 ----
        movq        xmm2, QWORD PTR[rsi + rax]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2

        ; ---- row 2 ----
        movq        xmm2, QWORD PTR[rsi + rax * 2]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx * 2]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2

        ; rsi/rdi now point at row 2
        lea         rsi, [rsi + rax * 2]
        lea         rdi, [rdi + rdx * 2]

        ; ---- row 3 ----
        movq        xmm2, QWORD PTR[rsi + rax]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2

        ; ---- row 4 ----
        movq        xmm2, QWORD PTR[rsi + rax *2]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx *2]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2

        ; rsi/rdi now point at row 4
        lea         rsi, [rsi + rax * 2]
        lea         rdi, [rdi + rdx * 2]

        ; ---- row 5 ----
        movq        xmm2, QWORD PTR[rsi + rax]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2

        ; ---- row 6 ----
        movq        xmm2, QWORD PTR[rsi + rax *2]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx *2]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2

        ; rsi/rdi now point at row 6
        lea         rsi, [rsi + rax * 2]
        lea         rdi, [rdi + rdx * 2]

        ; ---- row 7 ----
        movq        xmm2, QWORD PTR[rsi + rax]
        punpcklbw   xmm2, xmm0
        movq        xmm3, QWORD PTR[rdi + rdx]
        punpcklbw   xmm3, xmm0

        psubsw      xmm2, xmm3
        paddw       xmm7, xmm2
        pmaddwd     xmm2, xmm2
        paddd       xmm1, xmm2


        ; ---- Sum: fold the 8 word sums into word 0 of xmm7 ----
        ; The additions are 16-bit and wrap; the total of 64 differences
        ; lies within [-16320, 16320], so the low word holds the exact
        ; two's-complement result and is sign-extended below.
        movdqa      xmm6, xmm7
        punpcklwd   xmm6, xmm0                      ; [w0,0, w1,0, w2,0, w3,0]
        punpckhwd   xmm7, xmm0                      ; [w4,0, w5,0, w6,0, w7,0]
        paddw       xmm6, xmm7

        movdqa      xmm7, xmm6
        punpckldq   xmm6, xmm0
        punpckhdq   xmm7, xmm0
        paddw       xmm6, xmm7

        movdqa      xmm7, xmm6
        psrldq      xmm6, 8
        paddw       xmm7, xmm6                      ; word 0 of xmm7 = Sum (16-bit)

        ; ---- SSE: add up the four dword partial sums ----
        movdqa      xmm2, xmm1
        punpckldq   xmm1, xmm0                      ; [s0, 0, s1, 0]
        punpckhdq   xmm2, xmm0                      ; [s2, 0, s3, 0]
        paddd       xmm1, xmm2                      ; [s0+s2, 0, s1+s3, 0]

        movdqa      xmm2, xmm1
        psrldq      xmm1, 8
        paddd       xmm1, xmm2                      ; low dword of xmm1 = SSE

        mov         rax, arg(5)                     ; Sum
        mov         rdi, arg(4)                     ; SSE

        movq        rdx, xmm7
        movsx       rcx, dx                         ; sign-extend the 16-bit Sum

        mov  dword ptr [rax],       ecx
        movd DWORD PTR [rdi],       xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp9_half_horiz_vert_variance8x_h_sse2
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *ref_ptr,
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int ref_pixels_per_line,
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int src_pixels_per_line,
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned int Height,
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int *sum,
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned int *sumsquared
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
;-----------------------------------------------------------------------------
; vp9_half_horiz_vert_variance8x_h_sse2
;
; Accumulates the difference statistics between an 8-pixel-wide source column
; and a reference block interpolated at the (x + 1/2, y + 1/2) position.
; For every row i in [0, Height):
;     h(i)    = pavgb(ref[i][x], ref[i][x+1])        (horizontal half-pel)
;     pred    = pavgb(h(i), h(i+1))                  (vertical half-pel)
;     diff    = pred - src[i][x]
; Outputs:
;     *sum        = sum of diff. Accumulated in 16-bit lanes (wrapping), then
;                   sign-extended to 32 bits.
;     *sumsquared = sum of diff * diff (32-bit).
; Reads Height + 1 rows of 9 bytes from ref_ptr and Height rows of 8 bytes
; from src_ptr. Height == 0 reads nothing and stores 0 to both outputs.
;
; Register roles:
;     rsi = current ref row          rax = ref_pixels_per_line
;     rdi = current src row          rdx = src_pixels_per_line
;     rcx = rows remaining           xmm0 = zero (byte -> word unpack)
;     xmm5 = h() of the previous ref row
;     xmm6 = sum lanes (8 x int16)   xmm7 = sumsquared lanes (4 x int32)
;
; The final reduction stays in XMM registers: no MMX register is touched, so
; the x87/MMX state is left untouched and no emms is needed before returning.
;-----------------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator
        pxor            xmm7,           xmm7                ;  sumsquared accumulator
        pxor            xmm0,           xmm0                ;  constant zero

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4) ;Height

        ; The row loop tests its counter after the body, so a zero Height
        ; must bypass it (and the priming read of the first ref row).
        test            rcx,            rcx
        jz              .half_horiz_vert_variance8x_h_reduce

        ; Prime xmm5 with the horizontal average of the first ref row.
        movq            xmm5,           QWORD PTR [rsi]     ;  r0..r7 of row 0
        movq            xmm3,           QWORD PTR [rsi+1]   ;  r1..r8 of row 0
        pavgb           xmm5,           xmm3                ;  xmm5 = h(0)
        add             rsi,            rax                 ;  ref -> row 1

.half_horiz_vert_variance8x_h_1:

        movq            xmm1,           QWORD PTR [rsi]     ;  r0..r7 of row i+1
        movq            xmm2,           QWORD PTR [rsi+1]   ;  r1..r8 of row i+1
        pavgb           xmm1,           xmm2                ;  xmm1 = h(i+1)

        pavgb           xmm5,           xmm1                ;  xmm5 = avg(h(i), h(i+1))
        punpcklbw       xmm5,           xmm0                ;  widen prediction to words

        movq            xmm3,           QWORD PTR [rdi]     ;  s0..s7 of src row i
        punpcklbw       xmm3,           xmm0                ;  widen source to words

        psubw           xmm5,           xmm3                ;  diff = pred - src
        paddw           xmm6,           xmm5                ;  sum lanes += diff
        pmaddwd         xmm5,           xmm5                ;  pairwise diff*diff -> dwords
        paddd           xmm7,           xmm5                ;  sumsquared lanes += diff^2

        movdqa          xmm5,           xmm1                ;  h(i+1) becomes h(i) for next row

        add             rsi,            rax                 ;  next ref row
        add             rdi,            rdx                 ;  next src row

        sub             rcx,            1
        jnz             .half_horiz_vert_variance8x_h_1

.half_horiz_vert_variance8x_h_reduce:

        ; Fold the eight int16 sum lanes into word 0 with wrapping 16-bit
        ; adds, then sign-extend that word to a dword.
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           8
        paddw           xmm6,           xmm5                ;  4 partial sums in words 0..3
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           4
        paddw           xmm6,           xmm5                ;  2 partial sums in words 0..1
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           2
        paddw           xmm6,           xmm5                ;  word 0 = total
        punpcklwd       xmm6,           xmm6                ;  dword 0 = total:total
        psrad           xmm6,           16                  ;  dword 0 = (int32)(int16)total

        ; Fold the four int32 sumsquared lanes into dword 0.
        movdqa          xmm5,           xmm7
        psrldq          xmm5,           8
        paddd           xmm7,           xmm5                ;  2 partial sums in dwords 0..1
        movdqa          xmm5,           xmm7
        psrldq          xmm5,           4
        paddd           xmm7,           xmm5                ;  dword 0 = total

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            DWORD PTR [rsi], xmm6
        movd            DWORD PTR [rdi], xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Accumulates the difference statistics between an 8-pixel-wide source column
; and a reference block interpolated at the (x, y + 1/2) position.
; For every row i in [0, Height):
;     pred = pavgb(ref[i][x], ref[i+1][x])           (vertical half-pel)
;     diff = pred - src[i][x]
; Outputs:
;     *sum        = sum of diff. Accumulated in 16-bit lanes (wrapping), then
;                   sign-extended to 32 bits.
;     *sumsquared = sum of diff * diff (32-bit).
; Reads Height + 1 rows of 8 bytes from ref_ptr and Height rows of 8 bytes
; from src_ptr. Height == 0 reads nothing and stores 0 to both outputs.
;
; Register roles:
;     rsi = current ref row          rax = ref_pixels_per_line
;     rdi = current src row          rdx = src_pixels_per_line
;     rcx = rows remaining           xmm0 = zero (byte -> word unpack)
;     xmm6 = sum lanes (8 x int16)   xmm7 = sumsquared lanes (4 x int32)
;
; The final reduction stays in XMM registers: no MMX register is touched, so
; the x87/MMX state is left untouched and no emms is needed before returning.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator
        pxor            xmm7,           xmm7                ;  sumsquared accumulator
        pxor            xmm0,           xmm0                ;  constant zero

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4) ;Height

        ; The row loop tests its counter after the body, so a zero Height
        ; must bypass it.
        test            rcx,            rcx
        jz              .half_vert_variance8x_h_reduce

.half_vert_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  r0..r7 of row i
        movq            xmm3,           QWORD PTR [rsi+rax] ;  r0..r7 of row i+1

        pavgb           xmm5,           xmm3                ;  vertical half-pel prediction
        punpcklbw       xmm5,           xmm0                ;  widen prediction to words

        movq            xmm3,           QWORD PTR [rdi]     ;  s0..s7 of src row i
        punpcklbw       xmm3,           xmm0                ;  widen source to words

        psubw           xmm5,           xmm3                ;  diff = pred - src
        paddw           xmm6,           xmm5                ;  sum lanes += diff
        pmaddwd         xmm5,           xmm5                ;  pairwise diff*diff -> dwords
        paddd           xmm7,           xmm5                ;  sumsquared lanes += diff^2

        add             rsi,            rax                 ;  next ref row
        add             rdi,            rdx                 ;  next src row

        sub             rcx,            1
        jnz             .half_vert_variance8x_h_1

.half_vert_variance8x_h_reduce:

        ; Fold the eight int16 sum lanes into word 0 with wrapping 16-bit
        ; adds, then sign-extend that word to a dword.
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           8
        paddw           xmm6,           xmm5                ;  4 partial sums in words 0..3
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           4
        paddw           xmm6,           xmm5                ;  2 partial sums in words 0..1
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           2
        paddw           xmm6,           xmm5                ;  word 0 = total
        punpcklwd       xmm6,           xmm6                ;  dword 0 = total:total
        psrad           xmm6,           16                  ;  dword 0 = (int32)(int16)total

        ; Fold the four int32 sumsquared lanes into dword 0.
        movdqa          xmm5,           xmm7
        psrldq          xmm5,           8
        paddd           xmm7,           xmm5                ;  2 partial sums in dwords 0..1
        movdqa          xmm5,           xmm7
        psrldq          xmm5,           4
        paddd           xmm7,           xmm5                ;  dword 0 = total

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            DWORD PTR [rsi], xmm6
        movd            DWORD PTR [rdi], xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Accumulates the difference statistics between an 8-pixel-wide source column
; and a reference block interpolated at the (x + 1/2, y) position.
; For every row i in [0, Height):
;     pred = pavgb(ref[i][x], ref[i][x+1])           (horizontal half-pel)
;     diff = pred - src[i][x]
; Outputs:
;     *sum        = sum of diff. Accumulated in 16-bit lanes (wrapping), then
;                   sign-extended to 32 bits.
;     *sumsquared = sum of diff * diff (32-bit).
; Reads Height rows of 9 bytes from ref_ptr and Height rows of 8 bytes from
; src_ptr. Height == 0 reads nothing and stores 0 to both outputs.
;
; Register roles:
;     rsi = current ref row          rax = ref_pixels_per_line
;     rdi = current src row          rdx = src_pixels_per_line
;     rcx = rows remaining           xmm0 = zero (byte -> word unpack)
;     xmm6 = sum lanes (8 x int16)   xmm7 = sumsquared lanes (4 x int32)
;
; The final reduction stays in XMM registers: no MMX register is touched, so
; the x87/MMX state is left untouched and no emms is needed before returning.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator
        pxor            xmm7,           xmm7                ;  sumsquared accumulator
        pxor            xmm0,           xmm0                ;  constant zero

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4) ;Height

        ; The row loop tests its counter after the body, so a zero Height
        ; must bypass it.
        test            rcx,            rcx
        jz              .half_horiz_variance8x_h_reduce

.half_horiz_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  r0..r7 of row i
        movq            xmm3,           QWORD PTR [rsi+1]   ;  r1..r8 of row i

        pavgb           xmm5,           xmm3                ;  horizontal half-pel prediction
        punpcklbw       xmm5,           xmm0                ;  widen prediction to words

        movq            xmm3,           QWORD PTR [rdi]     ;  s0..s7 of src row i
        punpcklbw       xmm3,           xmm0                ;  widen source to words

        psubw           xmm5,           xmm3                ;  diff = pred - src
        paddw           xmm6,           xmm5                ;  sum lanes += diff
        pmaddwd         xmm5,           xmm5                ;  pairwise diff*diff -> dwords
        paddd           xmm7,           xmm5                ;  sumsquared lanes += diff^2

        add             rsi,            rax                 ;  next ref row
        add             rdi,            rdx                 ;  next src row

        sub             rcx,            1
        jnz             .half_horiz_variance8x_h_1

.half_horiz_variance8x_h_reduce:

        ; Fold the eight int16 sum lanes into word 0 with wrapping 16-bit
        ; adds, then sign-extend that word to a dword.
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           8
        paddw           xmm6,           xmm5                ;  4 partial sums in words 0..3
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           4
        paddw           xmm6,           xmm5                ;  2 partial sums in words 0..1
        movdqa          xmm5,           xmm6
        psrldq          xmm5,           2
        paddw           xmm6,           xmm5                ;  word 0 = total
        punpcklwd       xmm6,           xmm6                ;  dword 0 = total:total
        psrad           xmm6,           16                  ;  dword 0 = (int32)(int16)total

        ; Fold the four int32 sumsquared lanes into dword 0.
        movdqa          xmm5,           xmm7
        psrldq          xmm5,           8
        paddd           xmm7,           xmm5                ;  2 partial sums in dwords 0..1
        movdqa          xmm5,           xmm7
        psrldq          xmm5,           4
        paddd           xmm7,           xmm5                ;  dword 0 = total

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            DWORD PTR [rsi], xmm6
        movd            DWORD PTR [rdi], xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
735