;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%include "vpx_ports/x86_abi_support.asm"
13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;-----------------------------------------------------------------------
; unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Sums the squares of the 256 16-bit words that start at src_ptr
; (16 iterations, 32 bytes = 16 words per iteration). pmaddwd treats the
; words as signed.
;
; In:    arg(0) = src_ptr
; Out:   rax    = sum[0] + sum[1], the two 32-bit partial sums from mm4
; Uses:  rcx (iteration counter), mm0-mm4, 8 bytes of stack scratch
; Note:  no emms is executed here; MMX state is left active on return.
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only one argument is declared but 7 are shadowed;
    ; kept as-is -- confirm against the macro in x86_abi_support.asm.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                          ; scratch for spilling mm4
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16                     ; iterations remaining
        pxor        mm4, mm4                    ; mm4 = two 32-bit partial sums

.NEXTROW:
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0                    ; square words, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec leaves CF untouched, so the loop must test ZF only. The
        ; previous "ja" also required CF=0 and thus silently depended on
        ; the carry produced by "add rax, 32" above.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
64ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
65ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;-----------------------------------------------------------------------
;unsigned int vp9_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8-row by 8-byte block, accumulates the sum of (src - ref) and
; the sum of (src - ref)^2, stores them to *Sum and *SSE respectively,
; and returns 0.
;
; Register roles:
;   rax = current src row,  rcx = source_stride (sign-extended)
;   rbx = current ref row,  rdx = recon_stride  (sign-extended)
;   mm5 = four 16-bit running sums of differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit running sums of squared differences
; Note:  no emms is executed here; MMX state is left active on return.
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; scratch: [rsp]=mm7, [rsp+8]=mm5
    ; end prolog


        pxor        mm5, mm5                    ; clear difference accumulator
        pxor        mm6, mm6                    ; zero for byte->word unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; One expansion per row, 8 rows. Each 16-bit lane of mm5 receives
        ; 2 differences per row (16 in total), each within [-255, 255],
        ; so the lanes cannot overflow.
%rep 8
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm1, [rbx]                  ; eight ref bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low half)
        psubsw      mm2, mm3                    ; src - ref (high half)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        paddd       mm7, mm0                    ; accumulate squared differences
        paddd       mm7, mm2

        add         rax, rcx                    ; next src row
        add         rbx, rdx                    ; next ref row
%endrep

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; spill 4 x 16-bit difference sums
        movq        QWORD PTR [rsp], mm7        ; spill 2 x 32-bit squared sums
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]      ; rbx is free: row pointer no longer needed
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;unsigned int
314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;vp9_get4x4var_mmx
315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;(
316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    unsigned char *src_ptr,
317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    int  source_stride,
318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    unsigned char *ref_ptr,
319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    int  recon_stride,
320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    unsigned int *SSE,
321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    int *Sum
322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;)
323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_get4x4var_mmx) PRIVATE
324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_get4x4var_mmx):
325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rbp
326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         rbp, rsp
327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SHADOW_ARGS_TO_STACK 6
328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push rsi
329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push rdi
330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push rbx
331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    sub         rsp, 16
332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; end prolog
333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pxor        mm5, mm5                    ; Blank mmx6
336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pxor        mm6, mm6                    ; Blank mmx7
337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pxor        mm7, mm7                    ; Blank mmx7
338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rbx, arg(2) ;[ref_ptr]
341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd      rcx, dword ptr arg(1) ;[source_stride]
342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 1
3453df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
3463df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; A-B (low order) to MM0
350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddw       mm5, mm0                    ; accumulate differences in mm5
351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square and accumulate
352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx,rdx                     ; Inc pointer into ref data
353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax,rcx                     ; Inc pointer into the new data
3543df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 2
3593df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; A-B (low order) to MM0
363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddw       mm5, mm0                    ; accumulate differences in mm5
364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square and accumulate
366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx,rdx                     ; Inc pointer into ref data
367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax,rcx                     ; Inc pointer into the new data
3683df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 3
3723df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; A-B (low order) to MM0
376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddw       mm5, mm0                    ; accumulate differences in mm5
377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square and accumulate
379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx,rdx                     ; Inc pointer into ref data
380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax,rcx                     ; Inc pointer into the new data
3813df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm1, [rbx]                  ; Copy 4 bytes to mm1
382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 4
3853df0563f1b24dac6c0bd122fc922a48211269061hkuang        movd        mm0, [rax]                  ; Copy 4 bytes to mm0
386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; A-B (low order) to MM0
390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddw       mm5, mm0                    ; accumulate differences in mm5
392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square and accumulate
394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Now accumulate the final results.
398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsx       rdx, WORD PTR [rsp+8]
401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsx       rcx, WORD PTR [rsp+10]
402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsx       rbx, WORD PTR [rsp+12]
403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsx       rax, WORD PTR [rsp+14]
404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rdx, rcx
405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx, rax
406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rdx, rbx    ;XSum
407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd      rax, DWORD PTR [rsp]
408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd      rcx, DWORD PTR [rsp+4]
409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax, rcx    ;XXSum
410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rsi, arg(4) ;SSE
411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rdi, arg(5) ;Sum
412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         dword ptr [rsi], eax
413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         dword ptr [rdi], edx
414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        xor         rax, rax    ; return 0
415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; begin epilog
418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    add rsp, 16
419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rbx
420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rdi
421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rsi
422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    UNSHADOW_ARGS
423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rbp
424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ret
425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;unsigned int                   // sum of squared (src - ref) byte differences over 4 rows x 4 bytes
429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;vp9_get4x4sse_cs_mmx
430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;(
431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    unsigned char *src_ptr,    // first row of the source block
432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    int  source_stride,        // added to src_ptr to step one row (sign-extended below)
433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    unsigned char *ref_ptr,    // first row of the reference block
434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;    int  recon_stride          // added to ref_ptr to step one row (sign-extended below)
435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;)
436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_get4x4sse_cs_mmx) PRIVATE
437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_get4x4sse_cs_mmx):
438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rbp
439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         rbp, rsp
440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SHADOW_ARGS_TO_STACK 4              ; ABI helper macro (defined outside this view); 4 matches the four arguments above
441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push rsi                            ; saved/restored, but not otherwise referenced in this routine
442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push rdi                            ; saved/restored, but not otherwise referenced in this routine
443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push rbx                            ; rbx is used below as the reference row pointer
444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; end prolog
445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Register roles: rax = source row pointer, rbx = reference row pointer,
446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;   rcx / rdx = source / reference stride, mm6 = zero, mm7 = two 32-bit partial sums.
447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pxor        mm6, mm6                    ; mm6 = 0, the zero operand for the byte->word unpacks
448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pxor        mm7, mm7                    ; mm7 = 0, accumulator for the squared differences
449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rbx, arg(2) ;[ref_ptr]
452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd      rcx, dword ptr arg(1) ;[source_stride]
453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 1
455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm0, [rax]                  ; Load 4 source bytes into mm0
456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm1, [rbx]                  ; Load 4 reference bytes into mm1
457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; interleave with zero: 4 bytes -> 4 words (0..255)
458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; mm0 = source - reference, per 16-bit word
460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs -> two dword sums
461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx,rdx                     ; Inc pointer into ref data
462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax,rcx                     ; Inc pointer into the new data
463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm1, [rbx]                  ; Preload 4 reference bytes of the next row into mm1
464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 2
467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm0, [rax]                  ; Load 4 source bytes into mm0
468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; interleave with zero: 4 bytes -> 4 words (0..255)
469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; mm0 = source - reference, per 16-bit word
471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs -> two dword sums
472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx,rdx                     ; Inc pointer into ref data
473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax,rcx                     ; Inc pointer into the new data
474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm1, [rbx]                  ; Preload 4 reference bytes of the next row into mm1
475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 3
478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm0, [rax]                  ; Load 4 source bytes into mm0
479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; interleave with zero: 4 bytes -> 4 words (0..255)
481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; mm0 = source - reference, per 16-bit word
482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs -> two dword sums
484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rbx,rdx                     ; Inc pointer into ref data
485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rax,rcx                     ; Inc pointer into the new data
486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm1, [rbx]                  ; Preload 4 reference bytes of the next row into mm1
487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Row 4
490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movd        mm0, [rax]                  ; Load 4 source bytes into mm0
491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm0, mm6                    ; interleave with zero: 4 bytes -> 4 words (0..255)
492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        punpcklbw   mm1, mm6
493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psubsw      mm0, mm1                    ; mm0 = source - reference, per 16-bit word
494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs -> two dword sums
495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm7, mm0                    ; accumulate in mm7
496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; Fold the two 32-bit partial sums held in mm7 into a single total.
497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movq        mm0,    mm7                 ; mm0 = copy of both partial sums
498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        psrlq       mm7,    32                  ; shift the high partial sum down into the low dword
499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        paddd       mm0,    mm7                 ; low dword of mm0 = low + high partial sums
501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movq        rax,    mm0                 ; return value: total is in the low 32 bits (the upper dword of mm0 still holds the high partial sum)
502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ; NOTE(review): no emms is issued in this routine -- presumably the caller resets MMX/x87 state; confirm.
503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; begin epilog
505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rbx                             ; restore in reverse order of the prolog pushes
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rdi
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rsi
508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    UNSHADOW_ARGS
509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rbp
510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ret
511