;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Byte offsets of the 16-byte scratch slots that the loop filter code in this
; file addresses as [rsp + _name].  Each slot sits 16 bytes after the previous
; one.  Every offset is parenthesized so that it expands as a single term no
; matter what expression it is used in.
%define _t0 0
%define _t1 (_t0 + 16)
%define _p3 (_t1 + 16)
%define _p2 (_p3 + 16)
%define _p1 (_p2 + 16)
%define _p0 (_p1 + 16)
%define _q0 (_p0 + 16)
%define _q1 (_q0 + 16)
%define _q2 (_q1 + 16)
%define _q3 (_q2 + 16)
; Size of the whole scratch area: ten 16-byte slots = 160 bytes.  Derived from
; the last slot so it cannot drift out of step with the list above.
%define lf_var_size (_q3 + 16)

; The use of pmaxub instead of psubusb to compute the filter mask was seen
; in ffvp8.

;-----------------------------------------------------------------------------
; LFH_FILTER_AND_HEV_MASK <full_rows>
;
; Builds the loop filter mask and the high-edge-variance (hev) mask for 16
; pixel columns across a horizontal edge, from the rows p3..p0 and q0..q3.
;
; %1 != 0 : each row is a single 16-byte movdqa load (the row addresses must
;           therefore be 16-byte aligned).
;           In : rsi = address of row q0, rdi = rsi + pitch, rax = pitch.
;           rax is negated by the macro and holds -pitch afterwards.
; %1 == 0 : each row is assembled from 8 bytes at rsi (low half) and 8 bytes
;           at rdi (high half).
;           In : rsi / rdi = address of row q1 in each of the two planes,
;                rax = -pitch, rcx = pitch.
;           rsi and rdi are moved by 4*rax; q2, q1, p2 and p1 are also saved
;           in their stack slots.
;
; Common
;   In  : rdx = address of limit.  blimit and thresh are fetched with
;         arg(2) and arg(4), so the enclosing function's argument frame must
;         be set up.  rsp = 16-byte aligned scratch area (slots _t0, _t1 and,
;         for %1 == 0, _p2/_p1/_q1/_q2 are written).
;   Out : xmm1 = filter mask: 0xff in every byte where neither the limit
;                check nor the blimit check left an excess, 0 elsewhere
;         xmm4 = hev mask
;         xmm0 = q0, xmm6 = p0, xmm7 = 0
;         [rsp+_t0] = abs(q1 - q0), [rsp+_t1] = abs(p1 - p0)
;   Clobbers: xmm0-xmm7, rdx (left holding arg(4)).
;
; abs(a - b) on unsigned bytes is formed as (a -sat b) | (b -sat a).
;-----------------------------------------------------------------------------
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,                   [rdi+2*rax]       ; q3
        movdqa      xmm1,                   [rsi+2*rax]       ; q2
        movdqa      xmm4,                   [rsi+rax]         ; q1
        movdqa      xmm5,                   [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2,                   [rsi + rcx*2]     ; q3 (low 8 bytes)
        movlps      xmm1,                   [rsi + rcx]       ; q2
        movlps      xmm4,                   [rsi]             ; q1
        movlps      xmm5,                   [rsi + rax]       ; q0

        movhps      xmm2,                   [rdi + rcx*2]     ; q3 (high 8 bytes)
        movhps      xmm1,                   [rdi + rcx]       ; q2
        movhps      xmm4,                   [rdi]             ; q1
        movhps      xmm5,                   [rdi + rax]       ; q0

        lea         rsi,                    [rsi + rax*4]     ; step back four rows
        lea         rdi,                    [rdi + rax*4]

        movdqa      [rsp+_q2],              xmm1              ; store q2
        movdqa      [rsp+_q1],              xmm4              ; store q1
%endif
        movdqa      xmm7,                   [rdx]             ; limit

        movdqa      xmm6,                   xmm1              ; q2
        movdqa      xmm3,                   xmm4              ; q1

        psubusb     xmm1,                   xmm2              ; q2-=q3
        psubusb     xmm2,                   xmm6              ; q3-=q2

        psubusb     xmm4,                   xmm6              ; q1-=q2
        psubusb     xmm6,                   xmm3              ; q2-=q1

        por         xmm4,                   xmm6              ; abs(q2-q1)
        por         xmm1,                   xmm2              ; abs(q3-q2)

        movdqa      xmm0,                   xmm5              ; q0
        pmaxub      xmm1,                   xmm4              ; xmm1 = running max of the abs differences

        psubusb     xmm5,                   xmm3              ; q0-=q1
        psubusb     xmm3,                   xmm0              ; q1-=q0

        por         xmm5,                   xmm3              ; abs(q0-q1)
        movdqa      [rsp+_t0],              xmm5              ; save to t0

        pmaxub      xmm1,                   xmm5

%if %1
        movdqa      xmm2,                   [rsi+4*rax]       ; p3
        movdqa      xmm4,                   [rdi+4*rax]       ; p2
        movdqa      xmm6,                   [rsi+2*rax]       ; p1
%else
        movlps      xmm2,                   [rsi + rax]       ; p3
        movlps      xmm4,                   [rsi]             ; p2
        movlps      xmm6,                   [rsi + rcx]       ; p1

        movhps      xmm2,                   [rdi + rax]
        movhps      xmm4,                   [rdi]
        movhps      xmm6,                   [rdi + rcx]

        movdqa      [rsp+_p2],              xmm4              ; store p2
        movdqa      [rsp+_p1],              xmm6              ; store p1
%endif

        movdqa      xmm5,                   xmm4              ; p2
        movdqa      xmm3,                   xmm6              ; p1

        psubusb     xmm4,                   xmm2              ; p2-=p3
        psubusb     xmm2,                   xmm5              ; p3-=p2

        ; The two one-sided differences are fed to pmaxub separately; at most
        ; one of each pair is non-zero, so this equals max with abs().
        psubusb     xmm3,                   xmm5              ; p1-=p2
        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)

        psubusb     xmm5,                   xmm6              ; p2-=p1
        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
        movdqa      xmm2,                   xmm6              ; p1

        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,                   [rsi+rax]         ; p0
        movdqa      xmm3,                   [rdi]             ; q1
%else
        movlps      xmm4,                   [rsi + rcx*2]     ; p0
        movhps      xmm4,                   [rdi + rcx*2]
        movdqa      xmm3,                   [rsp+_q1]         ; q1
%endif

        movdqa      xmm5,                   xmm4              ; p0
        psubusb     xmm4,                   xmm6              ; p0-=p1

        psubusb     xmm6,                   xmm5              ; p1-=p0

        por         xmm6,                   xmm4              ; abs(p1 - p0)
        mov         rdx,                    arg(2)            ; get blimit

        movdqa     [rsp+_t1],               xmm6              ; save to t1

        movdqa      xmm4,                   xmm3              ; q1
        pmaxub      xmm1,                   xmm6

        psubusb     xmm3,                   xmm2              ; q1-=p1
        psubusb     xmm2,                   xmm4              ; p1-=q1

        psubusb     xmm1,                   xmm7              ; max abs difference - limit (saturating)
        por         xmm2,                   xmm3              ; abs(p1-q1)

        movdqa      xmm7,                   [rdx]             ; blimit
        mov         rdx,                    arg(4)            ; hev get thresh

        movdqa      xmm3,                   xmm0              ; q0
        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero

        movdqa      xmm6,                   xmm5              ; p0
        psrlw       xmm2,                   1                 ; abs(p1-q1)/2

        psubusb     xmm5,                   xmm3              ; p0-=q0
        psubusb     xmm3,                   xmm6              ; q0-=p0
        por         xmm5,                   xmm3              ; abs(p0 - q0)

        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2

        movdqa      xmm4,                   [rsp+_t0]         ; hev get abs (q1 - q0)
        movdqa      xmm3,                   [rsp+_t1]         ; get abs (p1 - p0)

        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,                   [rdx]             ; hev thresh

        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        psubusb     xmm4,                   xmm2              ; hev: abs(q1 - q0) - thresh (saturating)

        psubusb     xmm3,                   xmm2              ; hev: abs(p1 - p0) - thresh (saturating)
        por         xmm1,                   xmm5              ; combine limit and blimit excess

        pxor        xmm7,                   xmm7
        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        ; The hev sum is compared with xmm5 (the blimit excess), not with
        ; zero; in bytes where xmm5 is zero this is a compare against zero.
        pcmpeqb     xmm4,                   xmm5              ; hev
        pcmpeqb     xmm3,                   xmm3              ; hev: xmm3 = all ones

        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
        pxor        xmm4,                   xmm3              ; hev: invert the compare result
%endmacro

;-----------------------------------------------------------------------------
; B_FILTER <variant>
;
; Filters p1, p0, q0 and q1 of 16 pixel columns, using the masks left in
; registers by LFH_FILTER_AND_HEV_MASK.
;
;   In  : xmm1 = filter mask, xmm4 = hev mask
;   %1 == 0 : xmm6 = p0, xmm0 = q0; p1 / q1 are read from [rsp+_p1] /
;             [rsp+_q1].  rsi and rdi are advanced by 2*rcx, then each result
;             row is stored as 8 bytes at rsi (low half) and 8 bytes at rdi
;             (high half), at offsets 0 (p0), rax (p1), rcx (q0), 2*rcx (q1).
;   %1 == 1 : xmm6 = p0, xmm0 = q0; p1 is read from [rsi+2*rax] and q1 from
;             [rdi]; the four result rows are stored with movdqa.
;   %1 == 2 : p1, p0, q0 and q1 are all read from their stack slots; nothing
;             is stored by the macro.
;   Out : xmm6 = p0, xmm1 = p1, xmm3 = q0, xmm7 = q1 (filtered, unsigned)
;   Clobbers: xmm0-xmm7 (and rsi, rdi when %1 == 0).
;
; Pixels are xor-ed with t80 to move them into signed range for the
; saturating arithmetic and xor-ed with it again before being written out.
;-----------------------------------------------------------------------------
%macro B_FILTER 1
        movdqa      xmm3,                   [GLOBAL(t80)]
%if %1 == 0
        movdqa      xmm2,                   [rsp+_p1]         ; p1
        movdqa      xmm7,                   [rsp+_q1]         ; q1
%elif %1 == 1
        movdqa      xmm2,                   [rsi+2*rax]       ; p1
        movdqa      xmm7,                   [rdi]             ; q1
%elif %1 == 2
        movdqa      xmm2,                   [rsp+_p1]         ; p1
        movdqa      xmm6,                   [rsp+_p0]         ; p0
        movdqa      xmm0,                   [rsp+_q0]         ; q0
        movdqa      xmm7,                   [rsp+_q1]         ; q1
%endif

        pxor        xmm2,                   xmm3              ; p1 offset to convert to signed values
        pxor        xmm7,                   xmm3              ; q1 offset to convert to signed values

        psubsb      xmm2,                   xmm7              ; p1 - q1
        pxor        xmm6,                   xmm3              ; p0 offset to convert to signed values

        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,                   xmm3              ; q0 offset to convert to signed values

        movdqa      xmm3,                   xmm0              ; q0
        psubsb      xmm0,                   xmm6              ; q0 - p0
        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
        pand        xmm1,                   xmm2              ; mask filter values we don't care about

        movdqa      xmm2,                   xmm1
        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        ; Signed byte >> 3: each byte is unpacked into the HIGH byte of a
        ; word and the word is shifted right arithmetically by 8 + 3 = 11.
        ; Whatever sits in the low byte (stale xmm5 / xmm0 contents, or a
        ; copy of the same byte) is shifted out, so it need not be cleared.
        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
        punpcklbw   xmm2,                   xmm2              ; exfxgxhx

        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
        psraw       xmm5,                   11                ; sign extended shift right by 3

        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
        psraw       xmm2,                   11                ; sign extended shift right by 3

        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,                   11                ; sign extended shift right by 3

        psraw       xmm1,                   11                ; sign extended shift right by 3
        movdqa      xmm5,                   xmm0              ; save results (low half, still as words)

        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3

        paddsb      xmm6,                   xmm2              ; p0+= p0 add

        ; Second tap: add one to the word-sized (... + 4) >> 3 values and
        ; shift right once more before packing back to bytes.
        movdqa      xmm2,                   [GLOBAL(ones)]
        paddsw      xmm5,                   xmm2
        paddsw      xmm1,                   xmm2
        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
        movdqa      xmm2,                   [GLOBAL(t80)]

%if %1 == 0
        movdqa      xmm1,                   [rsp+_p1]         ; p1
        lea         rsi,                    [rsi + rcx*2]     ; move both pointers to the p0 row
        lea         rdi,                    [rdi + rcx*2]
%elif %1 == 1
        movdqa      xmm1,                   [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,                   [rsp+_p1]         ; p1
%endif

        pandn       xmm4,                   xmm5              ; high edge variance additive: (~hev) & second tap
        pxor        xmm6,                   xmm2              ; p0 unoffset

        pxor        xmm1,                   xmm2              ; p1 reoffset
        psubsb      xmm3,                   xmm0              ; q0-= q0 add

        paddsb      xmm1,                   xmm4              ; p1+= p1 add
        pxor        xmm3,                   xmm2              ; q0 unoffset

        pxor        xmm1,                   xmm2              ; p1 unoffset
        psubsb      xmm7,                   xmm4              ; q1-= q1 add

        pxor        xmm7,                   xmm2              ; q1 unoffset
%if %1 == 0
        movq        [rsi],                  xmm6              ; p0
        movhps      [rdi],                  xmm6
        movq        [rsi + rax],            xmm1              ; p1
        movhps      [rdi + rax],            xmm1
        movq        [rsi + rcx],            xmm3              ; q0
        movhps      [rdi + rcx],            xmm3
        movq        [rsi + rcx*2],          xmm7              ; q1
        movhps      [rdi + rcx*2],          xmm7
%elif %1 == 1
        movdqa      [rsi+rax],              xmm6              ; write back p0
        movdqa      [rsi+2*rax],            xmm1              ; write back p1
        movdqa      [rsi],                  xmm3              ; write back q0
        movdqa      [rdi],                  xmm7              ; write back q1
%endif

%endmacro

%if ABI_IS_32BIT

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
;
; Filters one 16-pixel-wide horizontal edge in place.  Only assembled when
; ABI_IS_32BIT is non-zero.
;
; Register use inside the function:
;   rsi = src_ptr (row q0), rdi = src_ptr + src_pixel_step (row q1)
;   rax = src_pixel_step on entry to the mask macro, negated by it
;   rdx = limit on entry to the mask macro, reloaded there with blimit/thresh
;   rsp = 16-byte aligned scratch area of lf_var_size bytes
; All rows are accessed with movdqa, so src_ptr, the row pitch and the three
; parameter vectors must keep every access 16-byte aligned.
; Argument access, GOT handling and xmm/stack save and restore come from the
; macros of the included ABI support file.
global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size            ; reserve the scratch slots

        mov         rsi,                    arg(0)           ;src_ptr
        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step

        mov         rdx,                    arg(3)           ;limit

        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, lf_var_size                    ; release the scratch slots
    pop rsp                                 ; counterpart of ALIGN_STACK above (assumed to reload the pre-alignment rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
;
; Filters an 8-pixel-wide horizontal edge in two planes at once: the plane at
; src_ptr ("u") supplies the low 8 bytes of every vector and the plane at the
; sixth argument ("v") supplies the high 8 bytes.  The sixth argument is
; loaded into rdi and dereferenced, i.e. it is a pointer, not a count.
;
; Register use inside the function:
;   rsi = u + src_pixel_step, rdi = v + src_pixel_step (row q1 of each plane)
;   rcx = src_pixel_step, rax = -src_pixel_step
;   rdx = limit on entry to the mask macro, reloaded there with blimit/thresh
;   rsp = 16-byte aligned scratch area of lf_var_size bytes
; Argument access, GOT handling and xmm/stack save and restore come from the
; macros of the included ABI support file.
global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size            ; reserve the scratch slots

        mov         rsi,                    arg(0)             ; u
        mov         rdi,                    arg(5)             ; v
        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
        mov         rcx,                    rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx,                    arg(3)             ;limit

        lea         rsi,                    [rsi + rcx]        ; u: one row below the edge
        lea         rdi,                    [rdi + rcx]        ; v: one row below the edge

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, lf_var_size                    ; release the scratch slots
    pop rsp                                 ; counterpart of ALIGN_STACK above (assumed to reload the pre-alignment rsp)
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


; MB_FILTER_AND_WRITEBACK <mode>
;
; Applies the macroblock-edge filter to 16 byte lanes at once: p0/q0 get the
; hev-gated Filter1/Filter2 correction, then p0..p2 / q0..q2 get the
; 27/18/9-weighted correction for lanes without high edge variance.
;
; Expected on entry (produced by the code that precedes the macro call):
;   xmm1 = filter mask, xmm4 = hev mask (see the pand / pandn below)
;   modes 0 and 1: p0 in xmm6 and q0 in xmm0 (only mode 2 loads them here)
;
; <mode> selects where the pixel rows live:
;   0 - p1/q1/p2/q2 are read from the stack slots [rsp+_p1] etc. Results are
;       stored with movq/movhps: low 8 bytes through rsi, high 8 bytes
;       through rdi. rsi/rdi are first advanced by 2*rcx; rdx is clobbered.
;   1 - rows are read and written in place through rsi/rdi with movdqa, so
;       every row touched must be 16-byte aligned. rcx is set to -rax.
;       The addressing ([rsi] = q0, [rsi+rax] = p0) implies rax is the
;       negated pitch at this point.
;   2 - p1/p0/q0/q1 are read from and written back to the stack slots;
;       p2 and q2 are read from the stack and left in xmm2 / xmm5 (not stored).
;
; xmm0-xmm7 are all overwritten.
; NOTE(review): t80, t3, t4, s9 and s63 are constants defined elsewhere in the
; file; the comments below assume t80 = 0x80 per byte, t3/t4 = 3/4 per byte,
; s63 = 63 per word and s9 chosen so that pmulhw on (byte << 8) yields
; byte * 9 - confirm against their definitions.
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro MB_FILTER_AND_WRITEBACK 1
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,                   [GLOBAL(t80)]
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 0
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   [rsp+_p1]              ; p1
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,                   [rsp+_q1]              ; q1
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 1
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   [rsi+2*rax]     ; p1
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,                   [rdi]           ; q1
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; rcx = -rax, used below to reach q2 at [rdi+rcx]
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rcx,                    rax
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rcx
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 2
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   [rsp+_p1]       ; p1
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,                   [rsp+_p0]       ; p0
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,                   [rsp+_q0]       ; q0
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,                   [rsp+_q1]       ; q1
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm2,                   xmm3            ; p1 offset to convert to signed values
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7,                   xmm3            ; q1 offset to convert to signed values
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm6,                   xmm3            ; p0 offset to convert to signed values
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm0,                   xmm3            ; q0 offset to convert to signed values
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm2,                   xmm7            ; p1 - q1
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,                   xmm0            ; q0 (signed copy kept in xmm3 as qs0)
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm0,                   xmm6            ; q0 - p0
        ; three saturating adds rather than one multiply, so each step clamps
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm1,                   xmm2            ; mask filter values we don't care about
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   xmm1            ; vp8_filter
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm0,                   xmm0
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm1,                   xmm1
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; Widen the non-hev filter value to words. Unpacking into a zeroed
        ; register puts each byte in the upper half of its word (value << 8):
        ; xmm0 receives byte lanes 0-7, xmm1 receives byte lanes 8-15.
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,                   xmm2
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,                   [GLOBAL(s9)]
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm1,                   xmm4            ; Filter 2 (lo) * 9
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pmulhw      xmm0,                   xmm4            ; Filter 2 (hi) * 9
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; Per-byte arithmetic shift right by 3, done in word lanes: unpack so
        ; each source byte sits in the upper half of a word, then psraw by
        ; 8 + 3 = 11. Whatever the destination held in the low byte (stale
        ; xmm7 / xmm4 contents) is shifted out, so it needs no clearing.
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm7,                   11              ; sign extended shift right by 3
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm5,                   11              ; sign extended shift right by 3
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm4,                   11              ; sign extended shift right by 3
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm2,                   11              ; sign extended shift right by 3
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm6,                   xmm5            ; ps0 =ps0 + Filter2
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
        ; From the two "* 9" word vectors (xmm0 = lanes 0-7, xmm1 = lanes 8-15)
        ; build * 9 + 63, * 18 + 63 and * 27 + 63 using adds only.
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,                   xmm1
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,                   [GLOBAL(s63)]
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,                   xmm0
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   xmm5
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm0,                   xmm4            ; Filter 2 (hi) * 9 + 63
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm1,                   xmm4            ; Filter 2 (lo) * 9 + 63
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,                   xmm7
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,                   [GLOBAL(t80)]
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; reload the unmodified p1 / q1 (the copies loaded at the top were
        ; consumed by the filter-value computation)
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 0
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,                   [rsp+_q1]       ; q1
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,                   [rsp+_p1]       ; p1
        ; re-base both output pointers by two rows of rcx; after this
        ; [rsi] / [rdi] are the p0 rows used by the stores below
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                    [rsi+rcx*2]
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                    [rdi+rcx*2]
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 1
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,                   [rdi]           ; q1
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,                   [rsi+rax*2]     ; p1
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 2
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,                   [rsp+_p1]       ; p1
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,                   [rsp+_q1]       ; q1
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm1,                   xmm7            ; qs1 = q1 ^ 0x80
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm4,                   xmm7            ; ps1 = p1 ^ 0x80
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 1
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   [rdi+rax*4]     ; p2
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,                   [rdi+rcx]       ; q2
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,                   [rsp+_p2]       ; p2
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,                   [rsp+_q2]       ; q2
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm1,                   xmm7            ; *oq1 = sq^0x80;
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm4,                   xmm7            ; *op1 = sp^0x80;
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm2,                   xmm7            ; ps2 = p2 ^ 0x80
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm5,                   xmm7            ; qs2 = q2 ^ 0x80
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u)
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u)
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm2,                   xmm7            ; *op2 = sp^0x80;
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm5,                   xmm7            ; *oq2 = sq^0x80;
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm3,                   xmm7            ; *oq0 = sq^0x80
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm6,                   xmm7            ; *op0 = sp^0x80
        ; Final register assignment: xmm2 = p2, xmm4 = p1, xmm6 = p0,
        ;                            xmm3 = q0, xmm1 = q1, xmm5 = q2
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 0
        ; each register is split across two destinations: low 8 bytes to the
        ; rsi row, high 8 bytes to the rdi row
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi],                  xmm6            ; p0
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi],                  xmm6
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi + rcx],            xmm3            ; q0
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi + rcx],            xmm3
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdx,                    [rcx + rcx*2]   ; rdx = 3 * rcx, offset of the q2 row
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+rcx*2],            xmm1            ; q1
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+rcx*2],            xmm1
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi + rax],            xmm4            ; p1
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi + rax],            xmm4
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+rax*2],            xmm2            ; p2
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+rax*2],            xmm2
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+rdx],              xmm5            ; q2
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+rdx],              xmm5
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 1
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rdi+rcx],              xmm5            ; q2
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rdi],                  xmm1            ; q1
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsi],                  xmm3            ; q0
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsi+rax  ],            xmm6            ; p0
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsi+rax*2],            xmm4            ; p1
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rdi+rax*4],            xmm2            ; p2
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif %1 == 2
        ; p2 (xmm2) and q2 (xmm5) are intentionally not stored in this mode
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_p1],              xmm4            ; p1
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_p0],              xmm6            ; p0
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_q0],              xmm3            ; q0
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_q1],              xmm1            ; q1
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_mbloop_filter_horizontal_edge_sse2
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int            src_pixel_step,
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *blimit,
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *limit,
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *thresh
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
;
; Macroblock-edge filter across a horizontal edge, 16 pixels wide, operating
; in place on the rows around src_ptr (macro mode 1).
;
; Register state handed to the macros:
;   rsi = src_ptr, rdi = src_ptr + pitch, rax = +pitch, rdx = limit
;   rsp = base of lf_var_size (160) bytes of stack scratch
; Mode 1 of MB_FILTER_AND_WRITEBACK accesses the rows with movdqa, so src_ptr
; and the pitch must keep every touched row 16-byte aligned.
; NOTE(review): rax is still the positive pitch when LFH_FILTER_AND_HEV_MASK 1
; is entered, yet MB_FILTER_AND_WRITEBACK 1 addresses p0 as [rsi+rax];
; presumably the mask macro negates rax - confirm in its body.
; NOTE(review): blimit (arg 2) and thresh (arg 4) are not loaded here;
; presumably the mask macro reads them via arg() - confirm.
564233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
565233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_mbloop_filter_horizontal_edge_sse2):
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 5
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; align rsp and reserve the scratch slots; undone by `add rsp` / `pop rsp` below
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ALIGN_STACK 16, rax
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         rsp, lf_var_size
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                    arg(0)            ;src_ptr
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,                    arg(3)            ;limit
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; calculate breakout conditions and high edge variance
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan        LFH_FILTER_AND_HEV_MASK 1
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; filter and write back the results
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MB_FILTER_AND_WRITEBACK 1
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, lf_var_size
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsp
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_mbloop_filter_horizontal_edge_uv_sse2
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *u,
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int            src_pixel_step,
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *blimit,
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *limit,
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *thresh,
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *v
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
;
; Macroblock-edge filter across a horizontal edge for the u and v planes in
; one pass (macro mode 0). MB_FILTER_AND_WRITEBACK 0 stores the low 8 bytes
; of each result row through rsi (u) and the high 8 bytes through rdi (v).
;
; Register state handed to the macros:
;   rsi = u + pitch, rdi = v + pitch, rax = -pitch, rcx = +pitch, rdx = limit
;   rsp = base of lf_var_size (160) bytes of stack scratch; mode 0 of
;         MB_FILTER_AND_WRITEBACK reads p2/p1/q1/q2 from those slots, so
;         LFH_FILTER_AND_HEV_MASK 0 is expected to have filled them.
; NOTE(review): blimit (arg 2) and thresh (arg 4) are not loaded here;
; presumably the mask macro reads them via arg() - confirm.
610233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
611233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 6
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    ; align rsp and reserve the scratch slots; undone by `add rsp` / `pop rsp` below
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ALIGN_STACK 16, rax
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         rsp, lf_var_size
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                    arg(0)             ; u
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdi,                    arg(5)             ; v
626233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rcx,                    rax                ; rcx keeps the positive pitch
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rax                     ; negate pitch to deal with above border
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,                    arg(3)             ;limit
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; advance both plane pointers by one row (+pitch)
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                    [rsi + rcx]
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                    [rdi + rcx]
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; calculate breakout conditions and high edge variance
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan        LFH_FILTER_AND_HEV_MASK 0
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; filter and write back the results
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MB_FILTER_AND_WRITEBACK 0
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, lf_var_size
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsp
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro TRANSPOSE_16X8 2
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm4,               [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm1,               [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm7,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm2,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm1,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,               xmm7            ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm7,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi+rax*8]
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rdi+rax*8]
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                arg(5)          ; v_ptr
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %1 == 0
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi - 4]
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan
699233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_t0],          xmm2            ; save to free XMM2
700233d2500723e5594f3e7c70896ffeeef32b9c950ywan
701233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm2,               [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm6,               [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
703233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm0,               [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
704233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,               [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm1,               [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
706233d2500723e5594f3e7c70896ffeeef32b9c950ywan
707233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan
709233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm6,               [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
710233d2500723e5594f3e7c70896ffeeef32b9c950ywan
711233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,               xmm5            ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
712233d2500723e5594f3e7c70896ffeeef32b9c950ywan
713233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm5,               [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
714233d2500723e5594f3e7c70896ffeeef32b9c950ywan
715233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
716233d2500723e5594f3e7c70896ffeeef32b9c950ywan
717233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        xmm6,               [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
718233d2500723e5594f3e7c70896ffeeef32b9c950ywan
719233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
720233d2500723e5594f3e7c70896ffeeef32b9c950ywan
721233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,               xmm1            ;
722233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
723233d2500723e5594f3e7c70896ffeeef32b9c950ywan
724233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
725233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
726233d2500723e5594f3e7c70896ffeeef32b9c950ywan
727233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
728233d2500723e5594f3e7c70896ffeeef32b9c950ywan
729233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
730233d2500723e5594f3e7c70896ffeeef32b9c950ywan
731233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,               xmm5
732233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
733233d2500723e5594f3e7c70896ffeeef32b9c950ywan
734233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
735233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
736233d2500723e5594f3e7c70896ffeeef32b9c950ywan
737233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
738233d2500723e5594f3e7c70896ffeeef32b9c950ywan
739233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
740233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
741233d2500723e5594f3e7c70896ffeeef32b9c950ywan
742233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
743233d2500723e5594f3e7c70896ffeeef32b9c950ywan
744233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
745233d2500723e5594f3e7c70896ffeeef32b9c950ywan
746233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %2 == 0
747233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_q3],          xmm7            ; save 7
748233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_q2],          xmm6            ; save 6
749233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
750233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
751233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
752233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
753233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_p1],          xmm2            ; save 2
754233d2500723e5594f3e7c70896ffeeef32b9c950ywan
755233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
756233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
757233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_p0],          xmm3            ; save 3
758233d2500723e5594f3e7c70896ffeeef32b9c950ywan
759233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
760233d2500723e5594f3e7c70896ffeeef32b9c950ywan
761233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_q0],          xmm4            ; save 4
762233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_q1],          xmm5            ; save 5
763233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,               [rsp+_t0]
764233d2500723e5594f3e7c70896ffeeef32b9c950ywan
765233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,               xmm1            ;
766233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
767233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
768233d2500723e5594f3e7c70896ffeeef32b9c950ywan
769233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if %2 == 0
770233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_p2],          xmm1
771233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rsp+_p3],          xmm2
772233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
773233d2500723e5594f3e7c70896ffeeef32b9c950ywan
774233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro
775233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; LFV_FILTER_MASK_HEV_MASK
;
; Builds the filter mask and the high-edge-variance (hev) mask for 16
; pixels at once, operating on already transposed pixel columns.
;
; In:    xmm1 = p2, xmm2 = p3, xmm3 = p0
;        xmm5 = q1, xmm6 = q2, xmm7 = q3
;        [rsp+_p1], [rsp+_q0], [rsp+_q1] = p1, q0, q1
;        arg(2) = blimit, arg(3) = limit, arg(4) = thresh; each is
;        dereferenced as one aligned 16-byte vector.
; Out:   xmm1 = 0xff in every byte where
;               abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit and all six
;               neighbour differences are <= limit; 0x00 elsewhere
;        xmm4 = hev mask (see note)
;        xmm0 = 0
; Clobb: xmm0-xmm7, rdx
;
; Note:  the hev compare runs against xmm0 while xmm0 still holds the
;        saturated "max difference - limit" value, not zero. The hev byte
;        is therefore an exact thresh test only where the limit test
;        passed (xmm0 byte == 0).
;        NOTE(review): presumably the consumer only uses hev where the
;        mask in xmm1 is set - confirm against B_FILTER.
;-----------------------------------------------------------------------
%macro LFV_FILTER_MASK_HEV_MASK 0
        movdqa      xmm0,               xmm6            ; q2
        psubusb     xmm0,               xmm7            ; q2-q3

        psubusb     xmm7,               xmm6            ; q3-q2
        movdqa      xmm4,               xmm5            ; q1

        por         xmm7,               xmm0            ; abs (q3-q2)
        psubusb     xmm4,               xmm6            ; q1-q2

        movdqa      xmm0,               xmm1            ; p2
        psubusb     xmm6,               xmm5            ; q2-q1

        por         xmm6,               xmm4            ; abs (q2-q1)
        psubusb     xmm0,               xmm2            ; p2 - p3;

        psubusb     xmm2,               xmm1            ; p3 - p2;
        por         xmm0,               xmm2            ; abs(p2-p3)

        movdqa      xmm5,               [rsp+_p1]       ; p1
        pmaxub      xmm0,               xmm7            ; xmm0 accumulates the largest abs difference

        movdqa      xmm2,               xmm5            ; p1
        psubusb     xmm5,               xmm1            ; p1-p2
        psubusb     xmm1,               xmm2            ; p2-p1

        movdqa      xmm7,               xmm3            ; p0
        psubusb     xmm7,               xmm2            ; p0-p1

        por         xmm1,               xmm5            ; abs(p2-p1)
        pmaxub      xmm0,               xmm6

        pmaxub      xmm0,               xmm1
        movdqa      xmm1,               xmm2            ; p1

        psubusb     xmm2,               xmm3            ; p1-p0

        por         xmm2,               xmm7            ; abs(p1-p0), kept for the thresh test

        pmaxub      xmm0,               xmm2

        movdqa      xmm5,               [rsp+_q0]       ; q0
        movdqa      xmm7,               [rsp+_q1]       ; q1

        mov         rdx,                arg(3)          ; limit

        movdqa      xmm6,               xmm5            ; q0
        movdqa      xmm4,               xmm7            ; q1

        psubusb     xmm5,               xmm7            ; q0-q1
        psubusb     xmm7,               xmm6            ; q1-q0

        por         xmm7,               xmm5            ; abs(q1-q0), kept for the thresh test

        pmaxub      xmm0,               xmm7

        psubusb     xmm0,               [rdx]           ; nonzero where any difference > limit

        mov         rdx,                arg(2)          ; blimit
        movdqa      xmm5,               xmm4            ; q1

        psubusb     xmm5,               xmm1            ; q1-=p1
        psubusb     xmm1,               xmm4            ; p1-=q1

        por         xmm5,               xmm1            ; abs(p1-q1)
        movdqa      xmm1,               xmm3            ; p0

        ; psrlw below shifts whole words, so the lsb of each byte is
        ; cleared first to keep it from leaking into the neighbouring byte.
        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
        psubusb     xmm1,               xmm6            ; p0-q0

        movdqa      xmm4,               [rdx]           ; blimit
        mov         rdx,                arg(4)          ; get thresh

        psrlw       xmm5,               1               ; abs(p1-q1)/2
        psubusb     xmm6,               xmm3            ; q0-p0

        por         xmm1,               xmm6            ; abs(q0-p0)
        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
        movdqa      xmm3,               [rdx]           ; thresh

        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm2,               xmm3            ; abs(p1 - p0) > thresh

        psubusb     xmm7,               xmm3            ; abs(q1 - q0) > thresh

        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        por         xmm2,               xmm7            ; abs(p1 - p0) > thresh || abs(q1 - q0) > thresh

        por         xmm1,               xmm0            ; nonzero where the blimit or limit test failed
        pcmpeqb     xmm2,               xmm0            ; compared with the limit excess, not zero (see header note)

        pxor        xmm0,               xmm0
        pcmpeqb     xmm4,               xmm4            ; all ones

        pcmpeqb     xmm1,               xmm0            ; mask: 0xff where every test passed
        pxor        xmm4,               xmm2            ; hev: bitwise inverse of the compare result
%endmacro
873233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; BV_TRANSPOSE
;
; Transposes four 16-byte pixel columns (columns 2..5 of 16 rows) back
; into row order, producing four bytes per row. In the lane comments the
; first hex digit is the row (0-f) and the second is the column.
;
; In:    xmm1 = column 2, xmm6 = column 3, xmm3 = column 4, xmm7 = column 5
; Out:   xmm2 = rows 0-3, xmm6 = rows 4-7,
;        xmm1 = rows 8-b, xmm5 = rows c-f   (columns 2..5 of each row)
; Clobb: xmm3, xmm4 (xmm7 is read only)
;-----------------------------------------------------------------------
%macro BV_TRANSPOSE 0
        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        ; step 1: interleave bytes of neighbouring columns (2|3 and 4|5)
        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        ; step 2: interleave the byte pairs as words to get 4 bytes per row
        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro
903233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; BV_WRITEBACK %1, %2
;
; Stores eight 4-byte groups, one per row, at byte offset +2 of each row.
;
; %1 = xmm register whose four dwords go to rows 0..3 (lowest dword first)
; %2 = xmm register whose four dwords go to rows 4..7 (lowest dword first)
;
; In:    rsi = address of row 0, rdi = address of row 1,
;        rax = row stride. The rows 6/7 stores use 2*rcx, so rcx must be
;        3*rax for them to land on rows 6 and 7 (both callers in this
;        file set it up that way).
; Clobb: %1 and %2 (each is shifted right by 12 bytes in total)
;-----------------------------------------------------------------------
%macro BV_WRITEBACK 2
        movd        [rsi+2],            %1              ; row 0
        movd        [rsi+4*rax+2],      %2              ; row 4
        psrldq      %1,                 4               ; bring the next row's dword to the bottom
        psrldq      %2,                 4
        movd        [rdi+2],            %1              ; row 1
        movd        [rdi+4*rax+2],      %2              ; row 5
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rsi+2*rax+2],      %1              ; row 2
        movd        [rsi+2*rcx+2],      %2              ; row 6
        psrldq      %1,                 4
        psrldq      %2,                 4
        movd        [rdi+2*rax+2],      %1              ; row 3
        movd        [rdi+2*rcx+2],      %2              ; row 7
%endmacro
920233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; The 16-row vertical edge filter below is only assembled when
; ABI_IS_32BIT is true.
%if ABI_IS_32BIT

;-----------------------------------------------------------------------
;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
;
; Filters a vertical edge across 16 rows: the pixels around the edge are
; transposed into columns, the mask/hev masks are computed, B_FILTER is
; applied, and the four modified columns are transposed back and stored.
;
; Register roles after setup:
;   rsi = src_ptr - 4 (row 0), rdi = rsi + stride (row 1),
;   rax = stride (sign-extended src_pixel_step), rcx = 3 * stride
; Stack: lf_var_size bytes of 16-byte aligned scratch at [rsp].
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size                    ; scratch for the _t0 .. _q3 slots

        mov         rsi,        arg(0)                  ; src_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]               ; start 4 pixels left of the edge
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax*2+rax]             ; rcx = 3 * stride

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx,        [rax]
        neg         rdx                                 ; rdx = -stride

        ; xmm1/xmm5 hold rows 8-f, so rsi/rdi are expected to address
        ; rows 8/9 here.
        ; NOTE(review): presumably left that way by TRANSPOSE_16X8 1, 1 -
        ; confirm against the macro.
        BV_WRITEBACK xmm1, xmm5

        lea         rsi,        [rsi+rdx*8]             ; step back 8 rows
        lea         rdi,        [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6                         ; rows 0-7

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif
986233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
;
; Vertical edge filter for the two chroma planes: 16 rows are processed
; in one pass and the result is written back in two groups of 8 rows.
;
; Register roles after setup:
;   rsi = u - 4 (row 0), rdi = rsi + stride (row 1),
;   rax = stride (sign-extended src_pixel_step), rcx = 3 * stride
; Stack: lf_var_size bytes of 16-byte aligned scratch at [rsp].
;
; NOTE(review): arg(5) (v) is never read in this body; rsi is presumably
; switched to the v plane inside TRANSPOSE_16X8 0, 1 - confirm against
; the macro.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub             rsp, lf_var_size                    ; scratch for the _t0 .. _q3 slots

        mov         rsi,        arg(0)                  ; u_ptr
        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step

        lea         rsi,        [rsi - 4]               ; start 4 pixels left of the edge
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        lea         rcx,        [rax+2*rax]             ; rcx = 3 * stride

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        ; xmm1/xmm5 hold rows 8-f; they go to wherever rsi points now
        ; (see the NOTE(review) in the header).
        BV_WRITEBACK xmm1, xmm5

        mov         rsi,        arg(0)                  ; u_ptr
        lea         rsi,        [rsi - 4]
        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6                         ; rows 0-7 go to the u plane

    add rsp, lf_var_size
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1049233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; MBV_TRANSPOSE
;
; First part of the column-to-row transpose for all eight pixel columns.
; In the lane comments the first hex digit is the row (0-f) and the
; second is the column (0-7).
;
; In:    [rsp+_p3] = column 0, xmm2 = column 1,
;        [rsp+_p1] = column 2, [rsp+_p0] = column 3,
;        [rsp+_q0] = column 4, [rsp+_q1] = column 5,
;        xmm5      = column 6, [rsp+_q3] = column 7
; Out:   xmm0 = rows 0-1, columns 0-7 (complete)
;        xmm6 = rows 2-3, columns 0-7 (complete)
;        xmm3 = rows 4-7, columns 0-3
;        xmm2 = rows 4-7, columns 4-7
;        xmm1 = rows 8-b, columns 0-3
;        xmm4 = rows c-f, columns 0-3
;        xmm7 = rows 0-3, columns 4-7
; Clobb: xmm0-xmm4, xmm6, xmm7 (xmm5 is read only)
;
; Only the low halves (rows 0-7) of columns 4..7 are consumed here; the
; high halves are left for code outside this macro.
;-----------------------------------------------------------------------
%macro MBV_TRANSPOSE 0
        ; columns 0|1 and 2|3 interleaved as bytes
        movdqa      xmm0,               [rsp+_p3]           ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0,               xmm2                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1,               xmm2                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm7,               [rsp+_p1]           ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6,               xmm7                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm7,               [rsp+_p0]           ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6,               [rsp+_p0]           ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        ; byte pairs interleaved as words: columns 0..3 for every row
        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0,               xmm7                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3,               xmm7                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; columns 4|5 and 6|7, low halves (rows 0-7) only
        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm7,               [rsp+_q1]           ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6,               [rsp+_q3]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm2,               xmm7                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm7,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm2,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        ; dword interleave completes rows 0-3 (8 bytes per row)
        punpckldq   xmm0,               xmm7                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6,               xmm7                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
1087233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1088233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro MBV_WRITEBACK_1 0
; MBV_WRITEBACK_1 (no parameters)
; Stores eight 8-byte rows and repacks registers for MBV_WRITEBACK_2.
;   Reads : xmm0, xmm6, xmm3, xmm2, xmm1, xmm5 and the stack slots
;           [rsp+_q0], [rsp+_q1], [rsp+_q3]; addressing uses rsi, rdi, rax, rcx.
;   Stores: 8 bytes each at [rsi], [rdi], [rsi+2*rax], [rdi+2*rax],
;           [rsi+4*rax], [rdi+4*rax], [rsi+2*rcx], [rdi+2*rcx].
;   Leaves: xmm1, xmm5 and xmm7 repacked as shown in the lane comments below;
;           xmm0 and xmm3 are overwritten, xmm4 is not touched.
; The callers visible in this section set rdi = rsi + rax and rcx = 3*rax
; before invoking the macro, so the eight stores hit eight consecutive rows.
; Lane comments list bytes from most- to least-significant as <row><column>.
1089233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi],              xmm0                ; low 8 bytes of xmm0
1090233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi],              xmm0                ; high 8 bytes of xmm0
1091233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1092233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+2*rax],        xmm6
1093233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+2*rax],        xmm6
1094233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1095233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1096233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm0,               xmm2                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
1097233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm3,               xmm2                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
1098233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1099233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+4*rax],        xmm0
1100233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+4*rax],        xmm0
1101233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1102233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+2*rcx],        xmm3
1103233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+2*rcx],        xmm3
1104233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; From here on nothing is stored: the remaining lines interleave the upper
; halves of the _q0/_q1/_q3 slots with xmm5 and xmm1 for the next write-back.
; NOTE(review): xmm5 is expected to still hold the column-6 bytes
; (f6 e6 ... 16 06) on entry -- confirm against the preceding transpose macro.
1105233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,               [rsp+_q0]           ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
1106233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm7,               [rsp+_q1]           ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
1107233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm5,               [rsp+_q3]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
1108233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1109233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,               xmm7
1110233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
1111233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm7,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
1112233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1113233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1114233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
1115233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
1116233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro
1117233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1118233d2500723e5594f3e7c70896ffeeef32b9c950ywan%macro MBV_WRITEBACK_2 0
; MBV_WRITEBACK_2 (no parameters)
; Stores eight more 8-byte rows from the registers MBV_WRITEBACK_1 leaves behind.
;   Reads : xmm1, xmm5, xmm4, xmm7; addressing uses rsi, rdi, rax, rcx.
;   Stores: 8 bytes each at [rsi], [rdi], [rsi+2*rax], [rdi+2*rax],
;           [rsi+4*rax], [rdi+4*rax], [rsi+2*rcx], [rdi+2*rcx].
;   Overwrites xmm1 and xmm4.
; The callers visible in this section re-point rsi/rdi (rdi = rsi + rax) before
; invoking this macro, with rcx = 3*rax, so the stores hit eight consecutive rows.
1119233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi],              xmm1                ; low 8 bytes of xmm1
1120233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi],              xmm1                ; high 8 bytes of xmm1
1121233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1122233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+2*rax],        xmm5
1123233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+2*rax],        xmm5
1124233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1125233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1126233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,               xmm7                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
1127233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm4,               xmm7                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
1128233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1129233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+4*rax],        xmm1
1130233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+4*rax],        xmm1
1131233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1132233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movq        [rsi+2*rcx],        xmm4
1133233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movhps      [rdi+2*rcx],        xmm4
1134233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endmacro
1135233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1136233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1137233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_mbloop_filter_vertical_edge_sse2
1138233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
1139233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
1140233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int            src_pixel_step,
1141233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *blimit,
1142233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *limit,
1143233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *thresh
1144233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
; Filters one vertical edge across 16 rows.
; The routine loads src_ptr (arg 0) and src_pixel_step (arg 1), backs the
; pointer up by 4 bytes, runs the transpose / mask / filter macros, then writes
; 16 rows of 8 bytes back: rows 0..7 via MBV_WRITEBACK_1 and rows 8..15 via
; MBV_WRITEBACK_2.
; blimit, limit and thresh (args 2..4) are not referenced directly in this
; body; presumably the macros read them -- confirm against their definitions.
; Register roles in this body: rsi = current row, rdi = rsi + stride,
; rax = stride (sign flipped twice below), rcx = 3 * stride.
; Stack: lf_var_size bytes of scratch below an ALIGN_STACK 16 boundary.
1145233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
1146233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_mbloop_filter_vertical_edge_sse2):
1147233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
1148233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
1149233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 5
1150233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
1151233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
1152233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi
1153233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
1154233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
1155233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1156233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ALIGN_STACK 16, rax
1157233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub          rsp, lf_var_size                           ; room for the _t0.._q3 slots addressed off rsp
1158233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1159233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                arg(0)              ; src_ptr
1160233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
1161233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1162233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi - 4]           ; start 4 bytes before src_ptr
1163233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
1164233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,                [rax*2+rax]         ; rcx = 3 * src_pixel_step
1165233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1166233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Transpose
1167233d2500723e5594f3e7c70896ffeeef32b9c950ywan        TRANSPOSE_16X8 1, 0
1168233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1169233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; calculate filter mask and high edge variance
1170233d2500723e5594f3e7c70896ffeeef32b9c950ywan        LFV_FILTER_MASK_HEV_MASK
1171233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1172233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rax                                     ; rax = -src_pixel_step
1173233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; start work on filters
1174233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MB_FILTER_AND_WRITEBACK 2
1175233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; rax is negative here, so these step rsi/rdi back by 8 rows.
        ; NOTE(review): presumably this undoes an 8-row advance made inside
        ; TRANSPOSE_16X8 -- confirm against the macro definition.
1176233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi+rax*8]
1177233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rdi+rax*8]
1178233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1179233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; transpose and write back
1180233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MBV_TRANSPOSE
1181233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1182233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rax                                     ; rax = +src_pixel_step again
1183233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1184233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MBV_WRITEBACK_1
1185233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1186233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1187233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi+rax*8]         ; advance 8 rows for the second half
1188233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rdi+rax*8]
1189233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MBV_WRITEBACK_2
1190233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1191233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, lf_var_size                                    ; release the scratch slots
1192233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsp                                                 ; presumably the rsp value saved by ALIGN_STACK -- confirm in x86_abi_support.asm
1193233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
1194233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
1195233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
1196233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
1197233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
1198233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
1199233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
1200233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
1201233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1202233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1203233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_mbloop_filter_vertical_edge_uv_sse2
1204233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
1205233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *u,
1206233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int            src_pixel_step,
1207233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *blimit,
1208233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *limit,
1209233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char    *thresh,
1210233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *v
1211233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
; Two-plane variant of the vertical macroblock edge filter.
; The filtered result is written back in two halves: MBV_WRITEBACK_1 stores
; eight rows starting at u - 4 (arg 0) and MBV_WRITEBACK_2 stores eight rows
; starting at v - 4 (arg 5). Both halves use the same stride, src_pixel_step.
; TRANSPOSE_16X8 is invoked with "0, 0" here (the single-plane routine passes
; "1, 0"); presumably the first parameter selects loading the second eight
; rows from v -- confirm against the macro definition.
; blimit, limit and thresh are not referenced directly in this body;
; presumably the macros read them -- confirm against their definitions.
; Unlike the single-plane routine, rax is never negated here and rsi/rdi are
; reloaded from the arguments before each write-back.
1212233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
1213233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_mbloop_filter_vertical_edge_uv_sse2):
1214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
1215233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
1216233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 6
1217233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
1218233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
1219233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi
1220233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
1221233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
1222233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1223233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ALIGN_STACK 16, rax
1224233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub          rsp, lf_var_size                           ; room for the _t0.._q3 slots addressed off rsp
1225233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1226233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                arg(0)              ; u_ptr
1227233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
1228233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1229233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi - 4]           ; start 4 bytes before u
1230233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
1231233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,                [rax+2*rax]         ; rcx = 3 * src_pixel_step
1232233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1233233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; Transpose
1234233d2500723e5594f3e7c70896ffeeef32b9c950ywan        TRANSPOSE_16X8 0, 0
1235233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1236233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; calculate filter mask and high edge variance
1237233d2500723e5594f3e7c70896ffeeef32b9c950ywan        LFV_FILTER_MASK_HEV_MASK
1238233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1239233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; start work on filters
1240233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MB_FILTER_AND_WRITEBACK 2
1241233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1242233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; transpose and write back
1243233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MBV_TRANSPOSE
1244233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; First half: eight rows written through the u pointer.
1245233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                arg(0)             ;u_ptr
1246233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi - 4]
1247233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rsi + rax]
1248233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MBV_WRITEBACK_1
        ; Second half: eight rows written through the v pointer.
1249233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi,                arg(5)             ;v_ptr
1250233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,                [rsi - 4]
1251233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,                [rsi + rax]
1252233d2500723e5594f3e7c70896ffeeef32b9c950ywan        MBV_WRITEBACK_2
1253233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, lf_var_size                                    ; release the scratch slots
1255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsp                                                 ; presumably the rsp value saved by ALIGN_STACK -- confirm in x86_abi_support.asm
1256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
1257233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
1258233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
1259233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
1260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
1261233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
1262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
1263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
1264233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1265233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1266233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_loop_filter_simple_horizontal_edge_sse2
1267233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
1268233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
1269233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int  src_pixel_step,
1270233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char *blimit
1271233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
; Simple loop filter across a horizontal edge, 16 pixels wide.
; Rows used (s = src_pixel_step):
;   p1 = [src_ptr - 2*s], p0 = [src_ptr - s], q0 = [src_ptr], q1 = [src_ptr + s]
; Only p0 and q0 are written back.
; All four rows and *blimit are accessed with movdqa, so each address must be
; 16-byte aligned.
; Per byte, the code computes
;   mask   = (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit) ? 0xff : 0
;   filter = sat(sat(p1-q1) + 3*(q0-p0)) & mask     (on values xor'ed with t80)
;   q0    -= (filter + t4) >> 3
;   p0    += (filter + t3) >> 3
; tfe, t80, t3, t4, te0 and t1f are constants defined elsewhere in this file;
; presumably bytes of 0xfe, 0x80, 3, 4, 0xe0 and 0x1f -- confirm at their
; definitions.
; Registers: rcx = src_ptr, rax = -s after the neg below, rdx = q1 row address
; and later blimit; xmm0-xmm7 are all used.
1272233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
1273233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_loop_filter_simple_horizontal_edge_sse2):
1274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
1275233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
1276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 3
1277233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
1278233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
1279233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
1280233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1281233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rcx, arg(0)             ;src_ptr
1282233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
1283233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6, [GLOBAL(tfe)]
1284233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdx, [rcx + rax]        ; rdx = address of the q1 row
1285233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rax                     ; rax = -src_pixel_step, used to reach p0/p1
1286233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1287233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; calculate mask
1288233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0, [rdx]             ; q1
1289233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx, arg(2)             ;blimit
1290233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1, [rcx+2*rax]       ; p1
1291233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1292233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2, xmm1              ; keep p1 for the filter
1293233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3, xmm0              ; keep q1 for the filter
1294233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; Unsigned saturating subtraction in both directions, OR'ed together,
        ; gives the absolute difference per byte.
1295233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm0, xmm1              ; q1-=p1
1296233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm1, xmm3              ; p1-=q1
1297233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm1, xmm0              ; abs(p1-q1)
        ; psrlw shifts 16-bit words, so the low bit of each byte is cleared
        ; first to keep it from sliding into the neighbouring byte.
1298233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm1, xmm6              ; set lsb of each byte to zero
1299233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlw       xmm1, 1                 ; abs(p1-q1)/2
1300233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1301233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit
1302233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1303233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5, [rcx+rax]         ; p0
1304233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4, [rcx]             ; q0
1305233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0, xmm4              ; q0
1306233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6, xmm5              ; p0
1307233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm5, xmm4              ; p0-=q0
1308233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm4, xmm6              ; q0-=p0
1309233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm5, xmm4              ; abs(p0 - q0)
1310233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1311233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4, [GLOBAL(t80)]
1312233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1313233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
1314233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
1315233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm5, xmm7              ; non-zero only where abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
1316233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7, xmm7
1317233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pcmpeqb     xmm5, xmm7              ; mask: 0xff where the sum is <= blimit, else 0
1318233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1319233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1320233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; start work on filters
1321233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm2, xmm4     ; p1 offset to convert to signed values
1322233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm3, xmm4     ; q1 offset to convert to signed values
1323233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm2, xmm3              ; p1 - q1
1324233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1325233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm6, xmm4     ; p0 offset to convert to signed values
1326233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm0, xmm4     ; q0 offset to convert to signed values
1327233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3, xmm0              ; q0
1328233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm0, xmm6              ; q0 - p0
1329233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
1330233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
1331233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
1332233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm5, xmm2              ; mask filter values we don't care about
1333233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; Two rounded copies of the masked filter value: xmm5 (+t3) ends up
        ; added to p0, xmm0 (+t4) ends up subtracted from q0.
1334233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0, xmm5
1335233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm5,        [GLOBAL(t3)]                  ; filter + t3 (used for the p0 adjustment)
1336233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm0,        [GLOBAL(t4)]                  ; filter + t4 (used for the q0 adjustment)
1337233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1338233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1, [GLOBAL(te0)]
1339233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2, [GLOBAL(t1f)]
1340233d2500723e5594f3e7c70896ffeeef32b9c950ywan
        ; SSE2 has no per-byte arithmetic shift, so ">> 3" is built from a
        ; word shift, a mask that drops the bits shifted in from the adjacent
        ; byte, and sign bits OR'ed back in for negative bytes.
        ; xmm7 is still all zeroes from the mask computation above, which is
        ; why the pxor below is commented out.
1341233d2500723e5594f3e7c70896ffeeef32b9c950ywan;        pxor        xmm7, xmm7
1342233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pcmpgtb     xmm7, xmm0              ;save sign
1343233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm7, xmm1              ;preserve the upper 3 bits
1344233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlw       xmm0, 3
1345233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm0, xmm2              ;clear out upper 3 bits
1346233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm0, xmm7              ;add sign
1347233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm3, xmm0              ; q0 -= (filter + t4) >> 3
1348233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1349233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7, xmm7
1350233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pcmpgtb     xmm7, xmm5              ;save sign
1351233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm7, xmm1              ;preserve the upper 3 bits
1352233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlw       xmm5, 3
1353233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm5, xmm2              ;clear out upper 3 bits
1354233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm5, xmm7              ;add sign
1355233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm6, xmm5              ; p0 += (filter + t3) >> 3
1356233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1357233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm3, xmm4     ; unoffset
1358233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rcx], xmm3             ; write back q0
1359233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1360233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm6, xmm4     ; unoffset
1361233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      [rcx+rax], xmm6         ; write back p0
1362233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1363233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
1364233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
1365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
1366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
1367233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
1368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
1369233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1370233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1371233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_loop_filter_simple_vertical_edge_sse2
1372233d2500723e5594f3e7c70896ffeeef32b9c950ywan;(
1373233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    unsigned char *src_ptr,
1374233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    int  src_pixel_step,
1375233d2500723e5594f3e7c70896ffeeef32b9c950ywan;    const char *blimit,
1376233d2500723e5594f3e7c70896ffeeef32b9c950ywan;)
1377233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
1378233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_loop_filter_simple_vertical_edge_sse2):
1379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp         ; save old base pointer value.
1380233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp    ; set new base pointer value.
1381233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 3
1382233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
1383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx         ; save callee-saved reg
1384233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi
1385233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
1386233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
1387233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1388233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ALIGN_STACK 16, rax
1389233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         rsp, 32                         ; reserve 32 bytes
1390233d2500723e5594f3e7c70896ffeeef32b9c950ywan    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
1391233d2500723e5594f3e7c70896ffeeef32b9c950ywan    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
1392233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1393233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rsi, arg(0) ;src_ptr
1394233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
1395233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1396233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,        [rsi - 2 ]
1397233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,        [rsi + rax]
1398233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdx,        [rsi + rax*4]
1399233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,        [rdx + rax]
1400233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1401233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
1402233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
1403233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm2,       [rdi]                   ; 13 12 11 10
1404233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm3,       [rcx]                   ; 53 52 51 50
1405233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
1406233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
1407233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1408233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
1409233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
1410233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
1411233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
1412233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
1413233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
1414233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1415233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
1416233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
1417233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1418233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,       xmm0
1419233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
1420233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
1421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1422233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,       xmm0
1423233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
1424233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
1425233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1426233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,        [rsi + rax*8]
1427233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,        [rsi + rax]
1428233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdx,        [rsi + rax*4]
1429233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,        [rdx + rax]
1430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1431233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm4,       [rsi]                   ; 83 82 81 80
1432233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
1433233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm6,       [rdi]                   ; 93 92 91 90
1434233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
1435233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
1436233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
1437233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1438233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm1,       [rsi + rax*2]           ; a3 a2 a1 a0
1439233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
1440233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm3,       [rdi + rax*2]           ; b3 b2 b1 b0
1441233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
1442233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm1,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
1443233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm3,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
1444233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1445233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
1446233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm1,       xmm3                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
1447233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1448233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,       xmm4
1449233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm4,       xmm1                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
1450233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm7,       xmm1                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
1451233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1452233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,       xmm4
1453233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckldq   xmm4,       xmm7                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
1454233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhdq   xmm6,       xmm7                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
1455233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1456233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm1,       xmm0
1457233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,       xmm2
1458233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1459233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1460233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
1461233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1462233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
1463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1464233d2500723e5594f3e7c70896ffeeef32b9c950ywan        mov         rdx,        arg(2)                          ;blimit
1465233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1466233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; calculate mask
1467233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,       xmm0                            ; p1
1468233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7,       xmm3                            ; q1
1469233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm7,       xmm0                            ; q1-=p1
1470233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm6,       xmm3                            ; p1-=q1
1471233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm6,       xmm7                            ; abs(p1-q1)
1472233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
1473233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlw       xmm6,       1                               ; abs(p1-q1)/2
1474233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1475233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm7, [rdx]
1476233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1477233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,       xmm1                            ; p0
1478233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,       xmm2                            ; q0
1479233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm5,       xmm2                            ; p0-=q0
1480233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm4,       xmm1                            ; q0-=p0
1481233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm5,       xmm4                            ; abs(p0 - q0)
1482233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
1483233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
1484233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1485233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4, [GLOBAL(t80)]
1486233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1487233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
1488233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7,        xmm7
1489233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
1490233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1491233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; start work on filters
1492233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa        t0,        xmm0
1493233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa        t1,        xmm3
1494233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1495233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm0,        xmm4                  ; p1 offset to convert to signed values
1496233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm3,        xmm4                  ; q1 offset to convert to signed values
1497233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm0,        xmm3                           ; p1 - q1
1498233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1499233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm1,        xmm4                  ; offset to convert to signed values
1500233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm2,        xmm4                  ; offset to convert to signed values
1501233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1502233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,        xmm2                           ; offseted ; q0
1503233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm2,        xmm1                           ; q0 - p0
1504233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm0,        xmm2                           ; p1 - q1 + 1 * (q0 - p0)
1505233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm0,        xmm2                           ; p1 - q1 + 2 * (q0 - p0)
1506233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm0,        xmm2                           ; p1 - q1 + 3 * (q0 - p0)
1507233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm5,        xmm0                           ; mask filter values we don't care about
1508233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1509233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0, xmm5
1510233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm5,        [GLOBAL(t3)]                  ;  3* (q0 - p0) + (p1 - q1) + 4
1511233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm0,        [GLOBAL(t4)]                  ; +3 instead of +4
1512233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1513233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa  xmm6, [GLOBAL(te0)]
1514233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa  xmm2, [GLOBAL(t1f)]
1515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1516233d2500723e5594f3e7c70896ffeeef32b9c950ywan;        pxor        xmm7, xmm7
1517233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pcmpgtb     xmm7, xmm0              ;save sign
1518233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm7, xmm6              ;preserve the upper 3 bits
1519233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlw       xmm0, 3
1520233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm0, xmm2              ;clear out upper 3 bits
1521233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm0, xmm7              ;add sign
1522233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psubsb      xmm3, xmm0              ; q0-= q0sz add
1523233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1524233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm7, xmm7
1525233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pcmpgtb     xmm7, xmm5              ;save sign
1526233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm7, xmm6              ;preserve the upper 3 bits
1527233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrlw       xmm5, 3
1528233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pand        xmm5, xmm2              ;clear out upper 3 bits
1529233d2500723e5594f3e7c70896ffeeef32b9c950ywan        por         xmm5, xmm7              ;add sign
1530233d2500723e5594f3e7c70896ffeeef32b9c950ywan        paddsb      xmm1, xmm5              ; p0+= p0 add
1531233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1532233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm3,        xmm4                  ; unoffset   q0
1533233d2500723e5594f3e7c70896ffeeef32b9c950ywan        pxor        xmm1,        xmm4                  ; unoffset   p0
1534233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1535233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm0,        t0                             ; p1
1536233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm4,        t1                             ; q1
1537233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1538233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; write out order: xmm0 xmm2 xmm1 xmm3
1539233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdx,        [rsi + rax*4]
1540233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1541233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; transpose back to write out
1542233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1543233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
1544233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1545233d2500723e5594f3e7c70896ffeeef32b9c950ywan        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
1546233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm6,       xmm0
1547233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm0,       xmm1                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
1548233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm6,       xmm1                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
1549233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1550233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm5,       xmm3
1551233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
1552233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
1553233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1554233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm2,       xmm0
1555233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
1556233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1557233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1558233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movdqa      xmm3,       xmm6
1559233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpcklwd   xmm6,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1560233d2500723e5594f3e7c70896ffeeef32b9c950ywan        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1561233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1562233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rsi],      xmm6                               ; write the second 8-line result
1563233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdx],      xmm3
1564233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm6,       4
1565233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm3,       4
1566233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdi],      xmm6
1567233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rcx],      xmm3
1568233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm6,       4
1569233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm3,       4
1570233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rsi + rax*2], xmm6
1571233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdx + rax*2], xmm3
1572233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm6,       4
1573233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm3,       4
1574233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdi + rax*2], xmm6
1575233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rcx + rax*2], xmm3
1576233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1577233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rax
1578233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rsi,        [rsi + rax*8]
1579233d2500723e5594f3e7c70896ffeeef32b9c950ywan        neg         rax
1580233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdi,        [rsi + rax]
1581233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rdx,        [rsi + rax*4]
1582233d2500723e5594f3e7c70896ffeeef32b9c950ywan        lea         rcx,        [rdx + rax]
1583233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1584233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rsi],      xmm0                                ; write the first 8-line result
1585233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdx],      xmm2
1586233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm0,       4
1587233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm2,       4
1588233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdi],      xmm0
1589233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rcx],      xmm2
1590233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm0,       4
1591233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm2,       4
1592233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rsi + rax*2], xmm0
1593233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdx + rax*2], xmm2
1594233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm0,       4
1595233d2500723e5594f3e7c70896ffeeef32b9c950ywan        psrldq      xmm2,       4
1596233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rdi + rax*2], xmm0
1597233d2500723e5594f3e7c70896ffeeef32b9c950ywan        movd        [rcx + rax*2], xmm2
1598233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1599233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add rsp, 32
1600233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsp
1601233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
1602233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
1603233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
1604233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
1605233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
1606233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
1607233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
1608233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
1609233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; ---------------------------------------------------------------------------
; Read-only constant tables.
;
; Each table below is exactly 16 bytes (16 x db or 8 x dw) and is preceded by
; "align 16", so it can be used directly as a 128-bit memory operand. The
; filter code above reaches them through GLOBAL(name).
;
; Tables marked "no use in the code directly above" are not referenced by the
; simple vertical-edge filter that ends just before this section; they are
; presumably used by other routines earlier in the file -- verify before
; changing or removing them.
; ---------------------------------------------------------------------------
1610233d2500723e5594f3e7c70896ffeeef32b9c950ywanSECTION_RODATA
; tfe: 0xfe in every byte. ANDed into abs(p1-q1) to clear bit 0 of each byte
; so that the following word-wide "psrlw 1" cannot shift the low bit of the
; upper byte into the top bit of the lower byte (per-byte unsigned /2).
1611233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1612233d2500723e5594f3e7c70896ffeeef32b9c950ywantfe:
1613233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0xfe
; t80: 0x80 in every byte. XORed with pixel bytes to flip the top bit, which
; offsets unsigned pixel values into signed range before the saturating
; signed arithmetic, and XORed again afterwards to undo the offset.
1614233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1615233d2500723e5594f3e7c70896ffeeef32b9c950ywant80:
1616233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0x80
; t1s: 0x01 in every byte. No use in the code directly above.
1617233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1618233d2500723e5594f3e7c70896ffeeef32b9c950ywant1s:
1619233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0x01
; t3: 0x03 in every byte. Added (paddsb) to the masked filter value whose
; >>3 result is then added to p0.
1620233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1621233d2500723e5594f3e7c70896ffeeef32b9c950ywant3:
1622233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0x03
; t4: 0x04 in every byte. Added (paddsb) to the masked filter value whose
; >>3 result is then subtracted from q0.
1623233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1624233d2500723e5594f3e7c70896ffeeef32b9c950ywant4:
1625233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0x04
; ones: 0x0001 in each of the 8 words. No use in the code directly above.
1626233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1627233d2500723e5594f3e7c70896ffeeef32b9c950ywanones:
1628233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 8 dw 0x0001
; s9: 0x0900 in each of the 8 words. No use in the code directly above.
1629233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1630233d2500723e5594f3e7c70896ffeeef32b9c950ywans9:
1631233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 8 dw 0x0900
; s63: 0x003f in each of the 8 words. No use in the code directly above.
1632233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1633233d2500723e5594f3e7c70896ffeeef32b9c950ywans63:
1634233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 8 dw 0x003f
; te0: 0xe0 in every byte (top three bits set). ANDed with the per-byte sign
; mask produced by pcmpgtb to obtain the three sign-fill bits that a per-byte
; arithmetic shift right by 3 would have produced.
1635233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1636233d2500723e5594f3e7c70896ffeeef32b9c950ywante0:
1637233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0xe0
; t1f: 0x1f in every byte (low five bits set). ANDed after the word-wide
; "psrlw 3" to drop the three bits shifted in from the neighbouring byte;
; the result is then ORed with the te0-masked sign bits.
1638233d2500723e5594f3e7c70896ffeeef32b9c950ywanalign 16
1639233d2500723e5594f3e7c70896ffeeef32b9c950ywant1f:
1640233d2500723e5594f3e7c70896ffeeef32b9c950ywan    times 16 db 0x1f
1641