;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%include "vpx_ports/x86_abi_support.asm"
13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 consecutive 16-bit signed values
; that start at src_ptr (8 iterations x 64 bytes per iteration).
;
; In:    arg(0) = src_ptr; read with movdqa, so it must be 16-byte aligned.
; Out:   low 32 bits of rax = sum of squares. The bits above that come from
;        the second dword lane of the reduction and are not meaningful.
; Uses:  rax (data pointer, then result), rcx (iteration counter),
;        xmm0-xmm3 (per-iteration products), xmm4 (4 x 32-bit accumulator).
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        mov         rcx, 8                  ; 8 iterations of 32 values each
        pxor        xmm4, xmm4              ; clear the running sum of squares

.NEXTROW:
        movdqa      xmm0, [rax]             ; 4 x 8 words = 32 values
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]
        pmaddwd     xmm0, xmm0              ; each dword = w[2i]^2 + w[2i+1]^2
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40               ; advance 64 bytes (32 words)
        dec         rcx
        ; dec leaves CF untouched, so branch on ZF only. The previous "ja"
        ; also tested the stale carry produced by the add above.
        jnz         .NEXTROW

        ; horizontal add of the four dword lanes into lane 0
        movdqa      xmm3,xmm4
        psrldq      xmm4,8
        paddd       xmm4,xmm3               ; lane0 = l0+l2, lane1 = l1+l3
        movdqa      xmm3,xmm4
        psrldq      xmm4,4
        paddd       xmm4,xmm3               ; lane0 = l0+l1+l2+l3
        movq        rax,xmm4                ; result is in the low 32 bits


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
70ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
71ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and stores
;   *Sum = sum of (src - ref)       over all 256 byte pairs
;   *SSE = sum of (src - ref)^2     over all 256 byte pairs
; Rows are read with unaligned loads, so neither pointer needs alignment.
; rax is not set to a result: it ends up holding the Sum pointer.
;
; Register roles inside the row loop:
;   rsi / rdi  current source / reference row
;   rax / rdx  source / reference stride (sign-extended)
;   rcx        rows remaining
;   xmm0       all zero, used to widen bytes to words
;   xmm6       4 x 32-bit partial sums of squared differences
;   xmm7       8 x 16-bit partial sums of differences
;              (32 differences per lane, each within +/-255, so no overflow)
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        mov         rdi,            arg(2) ;[ref_ptr]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        ; Prefetch the first eight source rows
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax*2]
        lea             rcx,    [rax+rax*2]                     ; rcx = 3 * source_stride
        prefetcht0      [rsi+rcx]
        lea             rbx,    [rsi+rax*4]                     ; rbx = source row 4
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax*2]
        prefetcht0      [rbx+rcx]

        ; Prefetch the first eight reference rows
        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx*2]
        lea             rcx,    [rdx+rdx*2]                     ; rcx = 3 * recon_stride
        prefetcht0      [rdi+rcx]
        lea             rbx,    [rdi+rdx*4]                     ; rbx = reference row 4
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx*2]
        prefetcht0      [rbx+rcx]

        pxor        xmm0,           xmm0                        ; zero for byte->word unpack
        pxor        xmm6,           xmm6                        ; squared-difference accumulator
        pxor        xmm7,           xmm7                        ; difference accumulator
        mov         rcx,            16                          ; rows to process

.row16:
        movdqu      xmm1,           XMMWORD PTR [rsi]           ; 16 source bytes
        movdqu      xmm2,           XMMWORD PTR [rdi]           ; 16 reference bytes

        prefetcht0      [rsi+rax*8]                             ; hint: 8 rows ahead
        prefetcht0      [rdi+rdx*8]

        ; widen source: xmm1 = low 8 bytes, xmm3 = high 8 bytes (as words)
        movdqa      xmm3,           xmm1
        punpcklbw   xmm1,           xmm0
        punpckhbw   xmm3,           xmm0

        ; widen reference: xmm2 = low 8 bytes, xmm4 = high 8 bytes (as words)
        movdqa      xmm4,           xmm2
        punpcklbw   xmm2,           xmm0
        punpckhbw   xmm4,           xmm0

        psubw       xmm1,           xmm2                        ; differences, low half
        psubw       xmm3,           xmm4                        ; differences, high half

        paddw       xmm7,           xmm1                        ; accumulate differences
        paddw       xmm7,           xmm3

        pmaddwd     xmm1,           xmm1                        ; pairwise d*d + d*d
        pmaddwd     xmm3,           xmm3
        paddd       xmm6,           xmm1                        ; accumulate squares
        paddd       xmm6,           xmm3

        add         rsi,            rax                         ; next source row
        add         rdi,            rdx                         ; next reference row

        sub         rcx,            1
        jnz         .row16

        ; ---- reduce the four SSE dwords into the low dword of xmm1 ----
        movdqa      xmm1,           xmm6
        movdqa      xmm2,           xmm1
        punpckldq   xmm1,           xmm0                        ; s0, 0, s1, 0
        punpckhdq   xmm2,           xmm0                        ; s2, 0, s3, 0
        paddd       xmm1,           xmm2                        ; s0+s2, 0, s1+s3, 0
        movdqa      xmm2,           xmm1
        psrldq      xmm1,           8
        paddd       xmm1,           xmm2                        ; low dword = s0+s1+s2+s3

        ; ---- sign-extend the eight difference words and reduce them ----
        ; Each word is placed in the upper half of a dword and shifted back
        ; down arithmetically, which sign-extends it to 32 bits.
        pxor        xmm6,           xmm6
        pxor        xmm5,           xmm5
        punpcklwd   xmm6,           xmm7
        punpckhwd   xmm5,           xmm7
        psrad       xmm6,           16
        psrad       xmm5,           16
        paddd       xmm6,           xmm5                        ; four signed partial sums

        movdqa      xmm7,           xmm6
        punpckldq   xmm6,           xmm0
        punpckhdq   xmm7,           xmm0
        paddd       xmm6,           xmm7
        movdqa      xmm7,           xmm6
        psrldq      xmm6,           8
        paddd       xmm7,           xmm6                        ; low dword = total difference

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movd DWORD PTR [rax],       xmm7
        movd DWORD PTR [rdi],       xmm1


    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
212ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
213ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
214ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
215ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 8 rows of 8 bytes from src_ptr and ref_ptr and stores
;   *Sum = sum of (src - ref)       over all 64 byte pairs
;   *SSE = sum of (src - ref)^2     over all 64 byte pairs
; rax is not set to a result: it ends up holding the Sum pointer.
;
; Register roles inside the row loop:
;   rsi / rdi  current source / reference row
;   rax / rdx  source / reference stride (sign-extended)
;   rcx        rows remaining
;   xmm0       all zero, used to widen bytes to words
;   xmm1       4 x 32-bit partial sums of squared differences
;   xmm7       8 x 16-bit partial sums of differences
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        pxor        xmm0,           xmm0                        ; zero for byte->word unpack
        pxor        xmm1,           xmm1                        ; squared-difference accumulator
        pxor        xmm7,           xmm7                        ; difference accumulator
        mov         rcx,            8                           ; rows to process

.row8:
        movq        xmm2,           QWORD PTR [rsi]             ; 8 source bytes
        movq        xmm3,           QWORD PTR [rdi]             ; 8 reference bytes

        punpcklbw   xmm2,           xmm0                        ; widen to words
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3                        ; differences (within +/-255)
        paddw       xmm7,           xmm2                        ; accumulate differences

        pmaddwd     xmm2,           xmm2                        ; pairwise d*d + d*d
        paddd       xmm1,           xmm2                        ; accumulate squares

        add         rsi,            rax                         ; next source row
        add         rdi,            rdx                         ; next reference row

        sub         rcx,            1
        jnz         .row8

        ; ---- reduce the eight difference words into the low word of xmm7 ----
        ; All additions are 16-bit and wrap. The total of 64 differences is
        ; within +/-16320, so the low word holds the exact signed sum and is
        ; sign-extended by the movsx below.
        movdqa      xmm6,           xmm7
        punpcklwd   xmm6,           xmm0
        punpckhwd   xmm7,           xmm0
        paddw       xmm6,           xmm7

        movdqa      xmm7,           xmm6
        punpckldq   xmm6,           xmm0
        punpckhdq   xmm7,           xmm0
        paddw       xmm6,           xmm7

        movdqa      xmm7,           xmm6
        psrldq      xmm6,           8
        paddw       xmm7,           xmm6

        ; ---- reduce the four SSE dwords into the low dword of xmm1 ----
        movdqa      xmm2,           xmm1
        punpckldq   xmm1,           xmm0                        ; s0, 0, s1, 0
        punpckhdq   xmm2,           xmm0                        ; s2, 0, s3, 0
        paddd       xmm1,           xmm2                        ; s0+s2, 0, s1+s3, 0

        movdqa      xmm2,           xmm1
        psrldq      xmm1,           8
        paddd       xmm1,           xmm2                        ; low dword = s0+s1+s2+s3

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movq        rdx,            xmm7
        movsx       rcx,            dx                          ; sign-extend the 16-bit total

        mov  dword ptr [rax],       ecx
        movd DWORD PTR [rdi],       xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide difference statistics against a half-pel (x+1/2, y+1/2)
; prediction.  For every output row the prediction is
;     pavgb( pavgb(ref[y][x],   ref[y][x+1]),
;            pavgb(ref[y+1][x], ref[y+1][x+1]) )
; and the routine stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
; over 8 columns x Height rows, each with a 32-bit store.
;
; Notes:
;   * Height must be >= 1: the row loop tests its counter only after a row
;     has been processed.
;   * Rows 0..Height of ref are read, 9 bytes per row.
;   * Column differences are accumulated in signed 16-bit lanes, so Height
;     must not exceed 128 (255 * 128 still fits in a signed word).
;   * The totals are reduced with SSE2 instructions only; no MMX register
;     is touched, so no EMMS is needed after this routine.
;   * arg(), sym() and the prolog/epilog macros are expected to come from
;     vpx_ports/x86_abi_support.asm.
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  sum accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        ; horizontal average of ref row 0, kept in xmm5 for the first pass
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = r0..r7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = r1..r8
        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5, xmm3)

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
%else
        add             rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

        ; horizontal average of the row below the one held in xmm5
        movq            xmm1,           QWORD PTR [rsi]     ;  xmm1 = r0..r7 of row i+1
        movq            xmm2,           QWORD PTR [rsi+1]   ;  xmm2 = r1..r8 of row i+1
        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1, xmm2)

        pavgb           xmm5,           xmm1                ;  vertical average of the two rows
        punpcklbw       xmm5,           xmm0                ;  prediction as 8 words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7
        punpcklbw       xmm3,           xmm0                ;  source as 8 words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - source
        paddw           xmm6,           xmm5                ;  accumulate per-column differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        movdqa          xmm5,           xmm1                ;  row i+1 becomes the upper row next pass

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1
        jnz             .half_horiz_vert_variance8x_h_1

        ; ---- sum: sign-extend the 8 column words to dwords, then add them up
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6                ;  columns 0-3 in the high word of each dword
        punpckhwd       xmm2,           xmm6                ;  columns 4-7 in the high word of each dword
        psrad           xmm1,           16                  ;  arithmetic shift = sign extension
        psrad           xmm2,           16
        paddd           xmm1,           xmm2                ;  4 partial sums

        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2                ;  2 partial sums in the low qword
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2                ;  low dword = total sum

        ; ---- sse: add the 4 dword partial sums
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3                ;  2 partial sums in the low qword
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3                ;  low dword = total sse

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm1
        movd            [rdi],          xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide difference statistics against a vertical half-pel
; prediction.  For every output row the prediction is
;     pavgb(ref[y][x], ref[y+1][x])
; and the routine stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
; over 8 columns x Height rows, each with a 32-bit store.
;
; Notes:
;   * Height must be >= 1: the row loop tests its counter only after a row
;     has been processed.
;   * Rows 0..Height of ref are read, 8 bytes per row.
;   * Column differences are accumulated in signed 16-bit lanes, so Height
;     must not exceed 128 (255 * 128 still fits in a signed word).
;   * The totals are reduced with SSE2 instructions only; no MMX register
;     is touched, so no EMMS is needed after this routine.
;   * arg(), sym() and the prolog/epilog macros are expected to come from
;     vpx_ports/x86_abi_support.asm.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  sum accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line, offset to the row below

.half_vert_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = r0..r7 of the current row
        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = r0..r7 of the row below

        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5, xmm3)
        punpcklbw       xmm5,           xmm0                ;  prediction as 8 words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7
        punpcklbw       xmm3,           xmm0                ;  source as 8 words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - source
        paddw           xmm6,           xmm5                ;  accumulate per-column differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1
        jnz             .half_vert_variance8x_h_1

        ; ---- sum: sign-extend the 8 column words to dwords, then add them up
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6                ;  columns 0-3 in the high word of each dword
        punpckhwd       xmm2,           xmm6                ;  columns 4-7 in the high word of each dword
        psrad           xmm1,           16                  ;  arithmetic shift = sign extension
        psrad           xmm2,           16
        paddd           xmm1,           xmm2                ;  4 partial sums

        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2                ;  2 partial sums in the low qword
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2                ;  low dword = total sum

        ; ---- sse: add the 4 dword partial sums
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3                ;  2 partial sums in the low qword
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3                ;  low dword = total sse

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm1
        movd            [rdi],          xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide difference statistics against a horizontal half-pel
; prediction.  For every output row the prediction is
;     pavgb(ref[y][x], ref[y][x+1])
; and the routine stores
;     *sum        = sum of (prediction - src)
;     *sumsquared = sum of (prediction - src)^2
; over 8 columns x Height rows, each with a 32-bit store.
;
; Notes:
;   * Height must be >= 1: the row loop tests its counter only after a row
;     has been processed.
;   * Rows 0..Height-1 of ref are read, 9 bytes per row.
;   * Column differences are accumulated in signed 16-bit lanes, so Height
;     must not exceed 128 (255 * 128 still fits in a signed word).
;   * The totals are reduced with SSE2 instructions only; no MMX register
;     is touched, so no EMMS is needed after this routine.
;   * arg(), sym() and the prolog/epilog macros are expected to come from
;     vpx_ports/x86_abi_support.asm.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  sum accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  sse accumulator (4 dwords)
        pxor            xmm0,           xmm0                ;  zero, used to widen bytes to words

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

.half_horiz_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = r0..r7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = r1..r8

        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5, xmm3)
        punpcklbw       xmm5,           xmm0                ;  prediction as 8 words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0..d7
        punpcklbw       xmm3,           xmm0                ;  source as 8 words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - source
        paddw           xmm6,           xmm5                ;  accumulate per-column differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif
        sub             rcx,            1
        jnz             .half_horiz_variance8x_h_1

        ; ---- sum: sign-extend the 8 column words to dwords, then add them up
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6                ;  columns 0-3 in the high word of each dword
        punpckhwd       xmm2,           xmm6                ;  columns 4-7 in the high word of each dword
        psrad           xmm1,           16                  ;  arithmetic shift = sign extension
        psrad           xmm2,           16
        paddd           xmm1,           xmm2                ;  4 partial sums

        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2                ;  2 partial sums in the low qword
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2                ;  low dword = total sum

        ; ---- sse: add the 4 dword partial sums
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3                ;  2 partial sums in the low qword
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3                ;  low dword = total sse

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm1
        movd            [rdi],          xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
735