;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used by every routine in this file (sym, arg,
; SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, PRIVATE) are presumably supplied by
; this include -- confirm against vpx_ports/x86_abi_support.asm.
%include "vpx_ports/x86_abi_support.asm"

; Exported entry points: MMX sum-of-absolute-differences kernels, one per
; block size.
global sym(vp9_sad16x16_mmx) PRIVATE
global sym(vp9_sad8x16_mmx) PRIVATE
global sym(vp9_sad8x8_mmx) PRIVATE
global sym(vp9_sad4x4_mmx) PRIVATE
global sym(vp9_sad16x8_mmx) PRIVATE

;-----------------------------------------------------------------------
;unsigned int vp9_sad16x16_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over 16 rows of 16 bytes each, comparing
; the block at src_ptr with the block at ref_ptr.
;
; Register roles:
;   rsi = current src row           rdi = current ref row
;   rax = src_stride (sign-ext.)    rdx = ref_stride (sign-ext.)
;   rcx = value of rsi at which the row loop stops (rsi + 16*rax)
;   mm7 = four 16-bit running partial sums
;   mm6 = zero, used to widen bytes/words when unpacking
;
; The result is moved from mm7 into rax; the total sits in the low 32
; bits. No emms is executed in this routine.
;
; NOTE: the loop exits on pointer equality (rsi == rcx), so a src_stride
; of 0 would terminate after a single row.
;-----------------------------------------------------------------------
sym(vp9_sad16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; ABI helper macro; 4 = argument count
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16 * src_stride (two steps of 8 rows each)
        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]

        pxor            mm7,        mm7    ; accumulator = 0
        pxor            mm6,        mm6    ; constant zero

.x16x16sad_mmx_loop:

        ; load one 16-byte row from each block as two quadwords
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        ; keep copies of the src bytes for the reversed subtraction
        movq            mm4,        mm0
        movq            mm5,        mm2

        ; |src - ref| per byte: saturating subtract in both directions,
        ; one of the two results is zero, so OR yields the absolute value
        psubusb         mm0,        mm1
        psubusb         mm1,        mm4

        psubusb         mm2,        mm3
        psubusb         mm3,        mm5

        por             mm0,        mm1
        por             mm2,        mm3

        ; widen the 16 byte differences to 16-bit words
        movq            mm1,        mm0
        movq            mm3,        mm2

        punpcklbw       mm0,        mm6
        punpcklbw       mm2,        mm6

        punpckhbw       mm1,        mm6
        punpckhbw       mm3,        mm6

        paddw           mm0,        mm2
        paddw           mm1,        mm3

        ; advance both blocks by one row
        lea             rsi,        [rsi+rax]
        add             rdi,        rdx

        ; fold this row into the running sums
        ; (per lane: 4 bytes * 255 * 16 rows = 16320, fits in 16 bits)
        paddw           mm7,        mm0
        paddw           mm7,        mm1

        cmp             rsi,        rcx
        jne             .x16x16sad_mmx_loop

        ; horizontal reduction of the four word lanes in mm7
        movq            mm0,        mm7

        punpcklwd       mm0,        mm6    ; lanes 0,1 zero-extended to dwords
        punpckhwd       mm7,        mm6    ; lanes 2,3 zero-extended to dwords

        paddw           mm0,        mm7
        movq            mm7,        mm0

        psrlq           mm0,        32     ; bring the upper dword down
        paddw           mm7,        mm0    ; low word = total (max 16*16*255 = 65280)

        movq            rax,        mm7    ; return value

    pop         rdi
    pop         rsi
    mov         rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_sad8x16_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over 16 rows of 8 bytes each, comparing
; the block at src_ptr with the block at ref_ptr.
;
; Register roles:
;   rsi = current src row           rdi = current ref row
;   rax = src_stride (sign-ext.)    rdx = ref_stride (sign-ext.)
;   rcx = value of rsi at which the row loop stops (rsi + 16*rax)
;   mm7 = four 16-bit running partial sums
;   mm6 = zero, used to widen bytes/words when unpacking
;
; The result is moved from mm7 into rax; the total sits in the low 32
; bits. No emms is executed in this routine.
;
; NOTE: the loop exits on pointer equality (rsi == rcx), so a src_stride
; of 0 would terminate after a single row.
;-----------------------------------------------------------------------
sym(vp9_sad8x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; ABI helper macro; 4 = argument count
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16 * src_stride (two steps of 8 rows each)
        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]

        pxor            mm7,        mm7    ; accumulator = 0
        pxor            mm6,        mm6    ; constant zero

.x8x16sad_mmx_loop:

        ; load one 8-byte row from each block
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        ; |src - ref| per byte via two saturating subtractions ORed together
        movq            mm2,        mm0
        psubusb         mm0,        mm1

        psubusb         mm1,        mm2
        por             mm0,        mm1

        ; widen the 8 byte differences to 16-bit words
        movq            mm2,        mm0
        punpcklbw       mm0,        mm6

        punpckhbw       mm2,        mm6
        lea             rsi,        [rsi+rax]      ; next src row

        add             rdi,        rdx            ; next ref row
        paddw           mm7,        mm0

        paddw           mm7,        mm2
        cmp             rsi,        rcx

        jne             .x8x16sad_mmx_loop

        ; horizontal reduction of the four word lanes in mm7
        movq            mm0,        mm7
        punpcklwd       mm0,        mm6    ; lanes 0,1 zero-extended to dwords

        punpckhwd       mm7,        mm6    ; lanes 2,3 zero-extended to dwords
        paddw           mm0,        mm7

        movq            mm7,        mm0
        psrlq           mm0,        32     ; bring the upper dword down

        paddw           mm7,        mm0    ; low word = total (max 8*16*255 = 32640)
        movq            rax,        mm7    ; return value

    pop         rdi
    pop         rsi
    mov         rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_sad8x8_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over 8 rows of 8 bytes each, comparing
; the block at src_ptr with the block at ref_ptr.
;
; Register roles:
;   rsi = current src row           rdi = current ref row
;   rax = src_stride (sign-ext.)    rdx = ref_stride (sign-ext.)
;   rcx = value of rsi at which the row loop stops (rsi + 8*rax)
;   mm7 = four 16-bit running partial sums
;   mm6 = zero, used to widen bytes/words when unpacking
;
; The result is moved from mm7 into rax; the total sits in the low 32
; bits. No emms is executed in this routine.
;
; NOTE: the loop exits on pointer equality (rsi == rcx), so a src_stride
; of 0 would terminate after a single row.
;-----------------------------------------------------------------------
sym(vp9_sad8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; ABI helper macro; 4 = argument count
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rax*8]    ; src_ptr + 8 * src_stride
        pxor            mm7,        mm7            ; accumulator = 0

        pxor            mm6,        mm6            ; constant zero

.x8x8sad_mmx_loop:

        ; load one 8-byte row from each block
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        ; |src - ref| per byte via two saturating subtractions ORed together
        movq            mm2,        mm0
        psubusb         mm0,        mm1

        psubusb         mm1,        mm2
        por             mm0,        mm1

        ; widen the 8 byte differences to words and pair them up
        movq            mm2,        mm0
        punpcklbw       mm0,        mm6

        punpckhbw       mm2,        mm6
        paddw           mm0,        mm2

        ; advance both blocks by one row
        lea             rsi,       [rsi+rax]
        add             rdi,        rdx

        paddw           mm7,       mm0
        cmp             rsi,        rcx

        jne             .x8x8sad_mmx_loop

        ; horizontal reduction of the four word lanes in mm7
        movq            mm0,        mm7
        punpcklwd       mm0,        mm6    ; lanes 0,1 zero-extended to dwords

        punpckhwd       mm7,        mm6    ; lanes 2,3 zero-extended to dwords
        paddw           mm0,        mm7

        movq            mm7,        mm0
        psrlq           mm0,        32     ; bring the upper dword down

        paddw           mm7,        mm0    ; low word = total (max 8*8*255 = 16320)
        movq            rax,        mm7    ; return value

    pop         rdi
    pop         rsi
    mov         rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_sad4x4_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over 4 rows of 4 bytes each, comparing
; the block at src_ptr with the block at ref_ptr. Straight-line code:
; rows are processed in two pairs, each pair packed into one MMX register.
;
; Register roles:
;   rsi = src row pointer           rdi = ref row pointer
;   rax = src_stride (sign-ext.)    rdx = ref_stride (sign-ext.)
;   mm3 = zero once cleared below, used to widen bytes/words
;
; The result is moved from mm0 into rax; the total sits in the low 32
; bits. No emms is executed in this routine.
;-----------------------------------------------------------------------
sym(vp9_sad4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; ABI helper macro; 4 = argument count
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rows 0 and 1: four bytes from each block per row
        movd            mm0,        DWORD PTR [rsi]
        movd            mm1,        DWORD PTR [rdi]

        movd            mm2,        DWORD PTR [rsi+rax]
        movd            mm3,        DWORD PTR [rdi+rdx]

        ; interleave the two rows so all 8 bytes share one register; src and
        ; ref are interleaved identically, so bytes still line up pairwise
        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        ; |src - ref| per byte via two saturating subtractions ORed together
        movq            mm2,        mm0
        psubusb         mm0,        mm1

        psubusb         mm1,        mm2
        por             mm0,        mm1

        movq            mm2,        mm0
        pxor            mm3,        mm3    ; mm3 = 0 from here on

        ; widen to words and pair up: mm0 = four word sums for rows 0-1
        punpcklbw       mm0,        mm3
        punpckhbw       mm2,        mm3

        paddw           mm0,        mm2

        ; step both pointers down two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        ; rows 2 and 3, same scheme using mm4..mm7
        movd            mm4,        DWORD PTR [rsi]
        movd            mm5,        DWORD PTR [rdi]

        movd            mm6,        DWORD PTR [rsi+rax]
        movd            mm7,        DWORD PTR [rdi+rdx]

        punpcklbw       mm4,        mm6
        punpcklbw       mm5,        mm7

        movq            mm6,        mm4
        psubusb         mm4,        mm5

        psubusb         mm5,        mm6
        por             mm4,        mm5

        movq            mm5,        mm4
        punpcklbw       mm4,        mm3

        punpckhbw       mm5,        mm3
        paddw           mm4,        mm5

        ; combine both row pairs, then reduce the four word lanes
        paddw           mm0,        mm4
        movq            mm1,        mm0

        punpcklwd       mm0,        mm3    ; lanes 0,1 zero-extended to dwords
        punpckhwd       mm1,        mm3    ; lanes 2,3 zero-extended to dwords

        paddw           mm0,        mm1
        movq            mm1,        mm0

        psrlq           mm0,        32     ; bring the upper dword down
        paddw           mm0,        mm1    ; low word = total (max 4*4*255 = 4080)

        movq            rax,        mm0    ; return value

    pop         rdi
    pop         rsi
    mov         rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_sad16x8_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over 8 rows of 16 bytes each, comparing
; the block at src_ptr with the block at ref_ptr.
;
; Register roles:
;   rsi = current src row           rdi = current ref row
;   rax = src_stride (sign-ext.)    rdx = ref_stride (sign-ext.)
;   rcx = value of rsi at which the row loop stops (rsi + 8*rax)
;   mm7 = four 16-bit running partial sums
;   mm6 = zero, used to widen bytes/words when unpacking
;
; The result is moved from mm7 into rax; the total sits in the low 32
; bits. No emms is executed in this routine.
;
; NOTE: the loop exits on pointer equality (rsi == rcx), so a src_stride
; of 0 would terminate after a single row.
;-----------------------------------------------------------------------
sym(vp9_sad16x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4                  ; ABI helper macro; 4 = argument count
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rax*8]    ; src_ptr + 8 * src_stride
        pxor            mm7,        mm7            ; accumulator = 0

        pxor            mm6,        mm6            ; constant zero

.x16x8sad_mmx_loop:

        ; load one 16-byte row from each block as two quadwords
        movq            mm0,       [rsi]
        movq            mm1,       [rdi]

        movq            mm2,        [rsi+8]
        movq            mm3,        [rdi+8]

        ; keep copies of the src bytes for the reversed subtraction
        movq            mm4,        mm0
        movq            mm5,        mm2

        ; |src - ref| per byte via two saturating subtractions ORed together
        psubusb         mm0,        mm1
        psubusb         mm1,        mm4

        psubusb         mm2,        mm3
        psubusb         mm3,        mm5

        por             mm0,        mm1
        por             mm2,        mm3

        ; widen the 16 byte differences to 16-bit words
        movq            mm1,        mm0
        movq            mm3,        mm2

        punpcklbw       mm0,        mm6
        punpckhbw       mm1,        mm6

        punpcklbw       mm2,        mm6
        punpckhbw       mm3,        mm6


        ; collapse the row to four word sums in mm0
        paddw           mm0,        mm2
        paddw           mm1,        mm3

        paddw           mm0,        mm1
        lea             rsi,        [rsi+rax]      ; next src row

        add             rdi,        rdx            ; next ref row
        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             .x16x8sad_mmx_loop

        ; horizontal reduction of the four word lanes in mm7
        movq            mm0,        mm7
        punpcklwd       mm0,        mm6    ; lanes 0,1 zero-extended to dwords

        punpckhwd       mm7,        mm6    ; lanes 2,3 zero-extended to dwords
        paddw           mm0,        mm7

        movq            mm7,        mm0
        psrlq           mm0,        32     ; bring the upper dword down

        paddw           mm7,        mm0    ; low word = total (max 16*8*255 = 32640)
        movq            rax,        mm7    ; return value

    pop         rdi
    pop         rsi
    mov         rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret