;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan%include "vpx_ports/x86_abi_support.asm"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Sums the squares of 256 consecutive signed 16-bit values starting at
; src_ptr (16 loop iterations, 32 bytes = 16 words per iteration).
;
; In:    arg(0) = src_ptr
; Out:   rax    = sum of the two 32-bit partial sums held in mm4
; Uses:  rax (cursor, then result), rcx (loop counter, then scratch),
;        mm0-mm3 (loaded data / products), mm4 (accumulator),
;        8 bytes of stack scratch at [rsp]
; Note:  no emms is executed here, so the MMX state is still live on return.
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): the prototype above has a single argument, yet 7 is passed
    ; to the macro. Kept as-is because the macro body is not visible here.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                          ; scratch qword for spilling mm4
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16                     ; iterations remaining
        pxor        mm4, mm4                    ; mm4 = two dword partial sums, start at 0

.NEXTROW:
        ; load 16 words (4 per register)
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]

        ; pmaddwd x,x: square each signed word and add adjacent pairs,
        ; leaving two dword sums of squares per register
        pmaddwd     mm0, mm0
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32                     ; advance to the next 16 words
        dec         rcx
        ; dec leaves CF untouched, so branch on ZF only; the previous "ja"
        ; also depended on the carry produced by the pointer add above.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4        ; spill both partial sums

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
64233d2500723e5594f3e7c70896ffeeef32b9c950ywan
65233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks eight rows of eight bytes from src_ptr and ref_ptr, accumulating
; the byte differences (src - ref) and the squares of those differences.
; Stores the sum of squares to *SSE and the sum of differences to *Sum,
; then returns 0 in rax.
;
; Register roles while the rows are processed:
;   rax = src row pointer          rcx = source_stride (sign-extended)
;   rbx = ref row pointer          rdx = recon_stride  (sign-extended)
;   mm5 = four word lanes of summed differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two dword lanes of summed squared differences
; Note: no emms is executed here, so the MMX state is still live on return.
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; scratch: [rsp] = mm7, [rsp+8] = mm5
    ; end prolog

        mov         rax, arg(0) ;[src_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm5, mm5                    ; difference accumulator = 0
        pxor        mm6, mm6                    ; constant zero for unpacking
        pxor        mm7, mm7                    ; squared-difference accumulator = 0

        ; One copy of this body is emitted per row (8 rows in total).
%rep 8
        movq        mm0, [rax]                  ; 8 source bytes
        movq        mm1, [rbx]                  ; 8 reference bytes
        movq        mm2, mm0                    ; second copy for the high half
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low 4 source bytes -> words
        punpcklbw   mm1, mm6                    ; low 4 reference bytes -> words
        punpckhbw   mm2, mm6                    ; high 4 source bytes -> words
        punpckhbw   mm3, mm6                    ; high 4 reference bytes -> words

        psubsw      mm0, mm1                    ; src - ref, low half
        psubsw      mm2, mm3                    ; src - ref, high half

        paddw       mm5, mm0                    ; add differences into the word lanes
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square and pair-add into dwords
        pmaddwd     mm2, mm2
        paddd       mm7, mm0                    ; add squares into the dword lanes
        paddd       mm7, mm2

        add         rax, rcx                    ; next source row
        add         rbx, rdx                    ; next reference row
%endrep

        ; Spill both accumulators and reduce their lanes with scalar code.
        movq        QWORD PTR [rsp], mm7        ; two dword sums of squares
        movq        QWORD PTR [rsp+8], mm5      ; four word sums of differences

        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum

        ; XXSum = dword lane 0 + dword lane 1
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx
        mov         dword ptr [rsi], eax

        ; XSum = total of the four sign-extended word lanes
        movsx       rax, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        add         rax, rcx
        movsx       rcx, WORD PTR [rsp+12]
        add         rax, rcx
        movsx       rcx, WORD PTR [rsp+14]
        add         rax, rcx
        mov         dword ptr [rdi], eax

        xor         rax, rax    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan;unsigned int
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan;vp8_get4x4var_mmx
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int  source_stride,
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *ref_ptr,
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int  recon_stride,
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned int *SSE,
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int *Sum
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
323233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_get4x4var_mmx) PRIVATE
324233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_get4x4var_mmx):
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 6
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push rsi
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push rdi
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push rbx
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         rsp, 16
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        mm5, mm5                    ; Blank mmx6
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        mm6, mm6                    ; Blank mmx7
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        mm7, mm7                    ; Blank mmx7
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rbx, arg(2) ;[ref_ptr]
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rcx, dword ptr arg(1) ;[source_stride]
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Row 1
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm0, [rax]                  ; Copy eight bytes to mm0
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm1, mm6
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsw      mm0, mm1                    ; A-B (low order) to MM0
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       mm5, mm0                    ; accumulate differences in mm5
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmaddwd     mm0, mm0                    ; square and accumulate
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rbx,rdx                     ; Inc pointer into ref data
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rax,rcx                     ; Inc pointer into the new data
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd       mm7, mm0                    ; accumulate in mm7
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Row 2
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm0, [rax]                  ; Copy eight bytes to mm0
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm1, mm6
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsw      mm0, mm1                    ; A-B (low order) to MM0
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       mm5, mm0                    ; accumulate differences in mm5
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmaddwd     mm0, mm0                    ; square and accumulate
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rbx,rdx                     ; Inc pointer into ref data
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rax,rcx                     ; Inc pointer into the new data
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd       mm7, mm0                    ; accumulate in mm7
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Row 3
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm0, [rax]                  ; Copy eight bytes to mm0
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm1, mm6
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsw      mm0, mm1                    ; A-B (low order) to MM0
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       mm5, mm0                    ; accumulate differences in mm5
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmaddwd     mm0, mm0                    ; square and accumulate
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rbx,rdx                     ; Inc pointer into ref data
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rax,rcx                     ; Inc pointer into the new data
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd       mm7, mm0                    ; accumulate in mm7
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Row 4
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        mm0, [rax]                  ; Copy eight bytes to mm0
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   mm1, mm6
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsw      mm0, mm1                    ; A-B (low order) to MM0
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       mm5, mm0                    ; accumulate differences in mm5
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmaddwd     mm0, mm0                    ; square and accumulate
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd       mm7, mm0                    ; accumulate in mm7
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Now accumulate the final results.
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsx       rdx, WORD PTR [rsp+8]
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsx       rcx, WORD PTR [rsp+10]
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsx       rbx, WORD PTR [rsp+12]
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsx       rax, WORD PTR [rsp+14]
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rdx, rcx
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rbx, rax
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rdx, rbx    ;XSum
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax, DWORD PTR [rsp]
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rcx, DWORD PTR [rsp+4]
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         rax, rcx    ;XXSum
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi, arg(4) ;SSE
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdi, arg(5) ;Sum
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         dword ptr [rsi], eax
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         dword ptr [rdi], edx
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan        xor         rax, rax    ; return 0
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, 16
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rbx
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Sum of squared differences over a 4x4 block: four rows of four bytes are
; read from src_ptr and from ref_ptr (stepping each pointer by its stride),
; and the total of (src - ref)^2 is returned in the low 32 bits of rax.
;
; Register roles:
;   rax = src row pointer       rcx = source_stride
;   rbx = ref row pointer       rdx = recon_stride
;   mm6 = zero, used to widen bytes to words
;   mm7 = two dword partial sums of the squared differences
;
; No emms is executed before returning.
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi
    push rdi
    push rbx
    ; end prolog

        mov         rax, arg(0)                 ; src_ptr
        mov         rbx, arg(2)                 ; ref_ptr
        movsxd      rcx, dword ptr arg(1)       ; source_stride
        movsxd      rdx, dword ptr arg(3)       ; recon_stride

        pxor        mm7, mm7                    ; squared-difference accumulator
        pxor        mm6, mm6                    ; constant zero for unpacking

        ; Row 1
        movd        mm1, [rbx]                  ; four ref bytes
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm1, mm6                    ; widen ref bytes to words
        punpcklbw   mm0, mm6                    ; widen src bytes to words
        psubsw      mm0, mm1                    ; src - ref, per word
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0                    ; fold into the running total
        add         rax, rcx                    ; next src row
        add         rbx, rdx                    ; next ref row

        ; Row 2
        movd        mm1, [rbx]                  ; four ref bytes
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm1, mm6
        punpcklbw   mm0, mm6
        psubsw      mm0, mm1                    ; src - ref, per word
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0
        add         rax, rcx                    ; next src row
        add         rbx, rdx                    ; next ref row

        ; Row 3
        movd        mm1, [rbx]                  ; four ref bytes
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm1, mm6
        punpcklbw   mm0, mm6
        psubsw      mm0, mm1                    ; src - ref, per word
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0
        add         rax, rcx                    ; next src row
        add         rbx, rdx                    ; next ref row

        ; Row 4 (last row: the pointers are not advanced afterwards)
        movd        mm1, [rbx]                  ; four ref bytes
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm1, mm6
        punpcklbw   mm0, mm6
        psubsw      mm0, mm1                    ; src - ref, per word
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0

        ; Horizontal add: low dword of mm0 = low dword + high dword of mm7
        movq        mm0,    mm7
        psrlq       mm7,    32                  ; bring the high partial sum down
        paddd       mm0,    mm7
        movq        rax,    mm0                 ; result is in the low 32 bits


    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan
%define mmx_filter_shift            7   ; arithmetic right shift applied after each filter pass

;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Runs a two-tap horizontal filter (HFilter) followed by a two-tap vertical
; filter (VFilter) over the data at ref_ptr, subtracts the four-pixel rows
; at src_ptr from the filtered result, and stores the sum of the
; differences in *sum and the sum of the squared differences in
; *sumsquared. Four output rows are produced; this reads five rows of five
; bytes each from ref_ptr and four rows of four bytes each from src_ptr.
;
; Each filter is read as two 8-byte word vectors: first tap at offset 0,
; second tap at offset 8. After each pass mmx_bi_rd is added (presumably
; the rounding constant; it is defined outside this block) and the result
; is shifted right by mmx_filter_shift.
;
; Register roles:
;   rax = HFilter               rdx = VFilter
;   rsi = ref row pointer       rdi = src row pointer
;   rcx = rows remaining
;   r8  = ref_pixels_per_line, r9 = src_pixels_per_line (64-bit builds only)
;   mm0 = zero, used to widen bytes to words
;   mm5 = horizontally filtered previous row
;   mm6 = word sums of the differences
;   mm7 = dword sums of the squared differences
;
; No emms is executed before returning.
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16                 ; reserved; not referenced in this function
    ; end prolog


        pxor            mm6,            mm6                 ; sum accumulator
        pxor            mm7,            mm7                 ; sum-of-squares accumulator

        mov             rax,            arg(4) ;HFilter
        mov             rdx,            arg(5) ;VFilter

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr

        mov             rcx,            4                   ; number of output rows
        pxor            mm0,            mm0                 ; constant zero for unpacking

        ; Horizontal pass over the first reference row; the result seeds mm5
        ; so the loop always has a "previous row" for the vertical pass.
        movd            mm1,            [rsi]               ; pixels 0..3
        movd            mm3,            [rsi+1]             ; pixels 1..4

        punpcklbw       mm1,            mm0                 ;
        pmullw          mm1,            [rax]               ; * first horizontal tap

        punpcklbw       mm3,            mm0                 ;
        pmullw          mm3,            [rax+8]             ; * second horizontal tap

        paddw           mm1,            mm3                 ;
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;

        psraw           mm1,            mmx_filter_shift    ;
        movq            mm5,            mm1

%if ABI_IS_32BIT
        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
%else
        ; Both strides are loop-invariant: load them once, ahead of the loop.
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

        ; Horizontal pass over the current reference row
        movd            mm1,            [rsi]               ; pixels 0..3
        movd            mm3,            [rsi+1]             ; pixels 1..4

        punpcklbw       mm1,            mm0                 ;
        pmullw          mm1,            [rax]               ; * first horizontal tap

        punpcklbw       mm3,            mm0                 ;
        pmullw          mm3,            [rax+8]             ; * second horizontal tap

        paddw           mm1,            mm3                 ;
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;

        psraw           mm1,            mmx_filter_shift    ;
        movq            mm3,            mm5                 ; previous filtered row

        ; Vertical pass: previous row * first tap + current row * second tap
        movq            mm5,            mm1                 ; current row becomes "previous"
        pmullw          mm3,            [rdx]               ;

        pmullw          mm1,            [rdx+8]             ;
        paddw           mm1,            mm3                 ;


        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
        psraw           mm1,            mmx_filter_shift    ;

        ; Difference against the source row
        movd            mm3,            [rdi]               ; four src bytes
        punpcklbw       mm3,            mm0                 ;

        psubw           mm1,            mm3                 ; filtered - src
        paddw           mm6,            mm1                 ; accumulate differences

        pmaddwd         mm1,            mm1                 ; square and add adjacent pairs
        paddd           mm7,            mm1                 ; accumulate squares

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
%else
        add             rsi,            r8                  ; strides loaded before the loop
        add             rdi,            r9
%endif
        sub             rcx,            1                   ;
        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;


        ; Reduce the four word sums in mm6 to one signed dword: each word is
        ; placed in the upper half of a dword, the dwords are added, and the
        ; arithmetic shift by 16 brings the total back down with its sign.
        pxor            mm3,            mm3                 ;
        pxor            mm2,            mm2                 ;

        punpcklwd       mm2,            mm6                 ;
        punpckhwd       mm3,            mm6                 ;

        paddd           mm2,            mm3                 ;
        movq            mm6,            mm2                 ;

        psrlq           mm6,            32                  ;
        paddd           mm2,            mm6                 ;

        psrad           mm2,            16                  ;

        ; Reduce the two dword sums of squares in mm7 to one dword
        movq            mm4,            mm7                 ;

        psrlq           mm4,            32                  ;
        paddd           mm4,            mm7                 ;

        mov             rdi,            arg(6) ;sum
        mov             rsi,            arg(7) ;sumsquared

        movd            dword ptr [rdi],          mm2                 ; *sum
        movd            dword ptr [rsi],          mm4                 ; *sumsquared



    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_filter_block2d_bil_var_mmx
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *ref_ptr,
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int ref_pixels_per_line,
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int src_pixels_per_line,
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned int Height,
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned short *HFilter,
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned short *VFilter,
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int *sum,
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned int *sumsquared
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
670233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
671233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_filter_block2d_bil_var_mmx):
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 9
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push rsi
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push rdi
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         rsp, 16
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor            mm6,            mm6                 ;
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor            mm7,            mm7                 ;
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov             rax,            arg(5) ;HFilter             ;
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov             rdx,            arg(6) ;VFilter             ;
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov             rsi,            arg(0) ;ref_ptr              ;
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov             rdi,            arg(2) ;src_ptr              ;
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd          rcx,            dword ptr arg(4) ;Height              ;
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor            mm0,            mm0                 ;
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm1,            [rsi]               ;
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm3,            [rsi+1]             ;
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm2,            mm1                 ;
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm4,            mm3                 ;
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw       mm1,            mm0                 ;
699233d2500723e5594f3e7c70896ffeeef32b9c950ywan
700233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw       mm2,            mm0                 ;
701233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm1,            [rax]               ;
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan
703233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm2,            [rax]               ;
704233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw       mm3,            mm0                 ;
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan
706233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw       mm4,            mm0                 ;
707233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm3,            [rax+8]             ;
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan
709233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm4,            [rax+8]             ;
710233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm1,            mm3                 ;
711233d2500723e5594f3e7c70896ffeeef32b9c950ywan
712233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm2,            mm4                 ;
713233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
714233d2500723e5594f3e7c70896ffeeef32b9c950ywan
715233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw           mm1,            mmx_filter_shift    ;
716233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
717233d2500723e5594f3e7c70896ffeeef32b9c950ywan
718233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw           mm2,            mmx_filter_shift    ;
719233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm5,            mm1
720233d2500723e5594f3e7c70896ffeeef32b9c950ywan
721233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb        mm5,            mm2                 ;
722233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if ABI_IS_32BIT
723233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
724233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else
725233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
726233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add             rsi,            r8
727233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
728233d2500723e5594f3e7c70896ffeeef32b9c950ywan
729233d2500723e5594f3e7c70896ffeeef32b9c950ywan.filter_block2d_bil_var_mmx_loop:
730233d2500723e5594f3e7c70896ffeeef32b9c950ywan
731233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm1,            [rsi]               ;
732233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm3,            [rsi+1]             ;
733233d2500723e5594f3e7c70896ffeeef32b9c950ywan
734233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm2,            mm1                 ;
735233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm4,            mm3                 ;
736233d2500723e5594f3e7c70896ffeeef32b9c950ywan
737233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw       mm1,            mm0                 ;
738233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw       mm2,            mm0                 ;
739233d2500723e5594f3e7c70896ffeeef32b9c950ywan
740233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm1,            [rax]               ;
741233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm2,            [rax]               ;
742233d2500723e5594f3e7c70896ffeeef32b9c950ywan
743233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw       mm3,            mm0                 ;
744233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw       mm4,            mm0                 ;
745233d2500723e5594f3e7c70896ffeeef32b9c950ywan
746233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm3,            [rax+8]             ;
747233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm4,            [rax+8]             ;
748233d2500723e5594f3e7c70896ffeeef32b9c950ywan
749233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm1,            mm3                 ;
750233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm2,            mm4                 ;
751233d2500723e5594f3e7c70896ffeeef32b9c950ywan
752233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
753233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw           mm1,            mmx_filter_shift    ;
754233d2500723e5594f3e7c70896ffeeef32b9c950ywan
755233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
756233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw           mm2,            mmx_filter_shift    ;
757233d2500723e5594f3e7c70896ffeeef32b9c950ywan
758233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm3,            mm5                 ;
759233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm4,            mm5                 ;
760233d2500723e5594f3e7c70896ffeeef32b9c950ywan
761233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw       mm3,            mm0                 ;
762233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw       mm4,            mm0                 ;
763233d2500723e5594f3e7c70896ffeeef32b9c950ywan
764233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm5,            mm1                 ;
765233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packuswb        mm5,            mm2                 ;
766233d2500723e5594f3e7c70896ffeeef32b9c950ywan
767233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm3,            [rdx]               ;
768233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm4,            [rdx]               ;
769233d2500723e5594f3e7c70896ffeeef32b9c950ywan
770233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm1,            [rdx+8]             ;
771233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmullw          mm2,            [rdx+8]             ;
772233d2500723e5594f3e7c70896ffeeef32b9c950ywan
773233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm1,            mm3                 ;
774233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm2,            mm4                 ;
775233d2500723e5594f3e7c70896ffeeef32b9c950ywan
776233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
777233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
778233d2500723e5594f3e7c70896ffeeef32b9c950ywan
779233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw           mm1,            mmx_filter_shift    ;
780233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw           mm2,            mmx_filter_shift    ;
781233d2500723e5594f3e7c70896ffeeef32b9c950ywan
782233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm3,            [rdi]               ;
783233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm4,            mm3                 ;
784233d2500723e5594f3e7c70896ffeeef32b9c950ywan
785233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw       mm3,            mm0                 ;
786233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw       mm4,            mm0                 ;
787233d2500723e5594f3e7c70896ffeeef32b9c950ywan
788233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw           mm1,            mm3                 ;
789233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubw           mm2,            mm4                 ;
790233d2500723e5594f3e7c70896ffeeef32b9c950ywan
791233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm6,            mm1                 ;
792233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmaddwd         mm1,            mm1                 ;
793233d2500723e5594f3e7c70896ffeeef32b9c950ywan
794233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw           mm6,            mm2                 ;
795233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmaddwd         mm2,            mm2                 ;
796233d2500723e5594f3e7c70896ffeeef32b9c950ywan
797233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd           mm7,            mm1                 ;
798233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd           mm7,            mm2                 ;
799233d2500723e5594f3e7c70896ffeeef32b9c950ywan
800233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if ABI_IS_32BIT
801233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
802233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
803233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else
804233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
805233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
806233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add             rsi,            r8
807233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add             rdi,            r9
808233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
809233d2500723e5594f3e7c70896ffeeef32b9c950ywan        sub             rcx,            1                   ;
810233d2500723e5594f3e7c70896ffeeef32b9c950ywan        jnz             .filter_block2d_bil_var_mmx_loop       ;
811233d2500723e5594f3e7c70896ffeeef32b9c950ywan
812233d2500723e5594f3e7c70896ffeeef32b9c950ywan
813233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor            mm3,            mm3                 ;
814233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor            mm2,            mm2                 ;
815233d2500723e5594f3e7c70896ffeeef32b9c950ywan
816233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd       mm2,            mm6                 ;
817233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd       mm3,            mm6                 ;
818233d2500723e5594f3e7c70896ffeeef32b9c950ywan
819233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd           mm2,            mm3                 ;
820233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm6,            mm2                 ;
821233d2500723e5594f3e7c70896ffeeef32b9c950ywan
822233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlq           mm6,            32                  ;
823233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd           mm2,            mm6                 ;
824233d2500723e5594f3e7c70896ffeeef32b9c950ywan
825233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrad           mm2,            16                  ;
826233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq            mm4,            mm7                 ;
827233d2500723e5594f3e7c70896ffeeef32b9c950ywan
828233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlq           mm4,            32                  ;
829233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddd           mm4,            mm7                 ;
830233d2500723e5594f3e7c70896ffeeef32b9c950ywan
831233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov             rdi,            arg(7) ;sum
832233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov             rsi,            arg(8) ;sumsquared
833233d2500723e5594f3e7c70896ffeeef32b9c950ywan
834233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd            dword ptr [rdi],          mm2                 ;
835233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd            dword ptr [rsi],          mm4                 ;
836233d2500723e5594f3e7c70896ffeeef32b9c950ywan
837233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
838233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, 16
839233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
840233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
841233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
842233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
843233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
844233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
845233d2500723e5594f3e7c70896ffeeef32b9c950ywan
846233d2500723e5594f3e7c70896ffeeef32b9c950ywan
847233d2500723e5594f3e7c70896ffeeef32b9c950ywanSECTION_RODATA
; mmx_bi_rd: rounding bias used by the bilinear-filter loop above.
; After the two weighted taps are summed, the loop adds this vector to the
; packed 16-bit results with paddw and then shifts them right arithmetically
; by mmx_filter_shift with psraw.
; Layout: four identical 16-bit words (8 bytes), one per word lane of an MMX
; register, so a single paddw with this memory operand biases all four lanes.
; NOTE(review): 64 is half of 1 << 7, so this presumably pairs with a
; mmx_filter_shift of 7 (round-to-nearest) -- confirm against the definition
; of mmx_filter_shift, which is outside this part of the file.
848233d2500723e5594f3e7c70896ffeeef32b9c950ywan;short mmx_bi_rd[4] = { 64, 64, 64, 64};
849233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
850233d2500723e5594f3e7c70896ffeeef32b9c950ywanmmx_bi_rd:
851233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 4 dw 64
852