190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber; Use of pmaxub instead of psubusb to compute filter mask was seen
15538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber; in ffvp8
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; LFH_FILTER_AND_HEV_MASK <mode>
;
; Loads the eight rows around a horizontal edge (p3..p0 on one side,
; q0..q3 on the other) and builds the two per-byte masks used by the
; filter macros that follow it.
;
;   %1 != 0 : each row is one 16-byte movdqa load.
;             In: rsi = row q0, rax = row pitch, rdi = rsi + rax.
;             rax is negated part-way through and is still negated on
;             exit, so [rsi+rax] is p0 from then on.
;   %1 == 0 : each register is assembled from two 8-byte rows (rsi ->
;             low half, rdi -> high half) using rax / rcx as row
;             offsets supplied by the caller. rsi and rdi are moved by
;             rax*4, and q2, q1, p2, p1 are spilled to [rsp],
;             [rsp + 16], [rsp + 32], [rsp + 48].
;
; In (both) : xmm7 = limit; arg(2) -> flimit and arg(4) -> thresh, each
;             read as 16 bytes with movdqa; t0 / t1 = 16-byte scratch
;             slots %define'd by the function that expands this macro.
; Out       : xmm1 = filter mask (0xFF in bytes that pass every limit
;                    test, 0x00 otherwise)
;             xmm4 = high-edge-variance (hev) mask, meaningful only in
;                    bytes where xmm1 is 0xFF (see note near the end)
;             xmm0 = q0, xmm6 = p0, xmm7 = 0
; Clobbers  : xmm2, xmm3, xmm5, rdx, t0, t1.
;-----------------------------------------------------------------------
17538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%macro LFH_FILTER_AND_HEV_MASK 1
18f71323e297a928af368937089d3ed71239786f86Andreas Huber%if %1
19f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,                   [rdi+2*rax]       ; q3
20f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,                   [rsi+2*rax]       ; q2
21538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   [rsi+rax]         ; q1
22538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,                   [rsi]             ; q0
23538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        neg         rax                     ; negate pitch to deal with above border
24f71323e297a928af368937089d3ed71239786f86Andreas Huber%else
25538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm2,                   [rsi + rcx*2]     ; q3
26538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm1,                   [rsi + rcx]       ; q2
27538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm4,                   [rsi]             ; q1
28538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm5,                   [rsi + rax]       ; q0
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm2,                   [rdi + rcx*2]
31538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm1,                   [rdi + rcx]
32538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm4,                   [rdi]
33538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm5,                   [rdi + rax]
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; step both row pointers by rax*4 so the p rows can be reached
        ; with the same rax / rcx offsets below
35538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rsi,                    [rsi + rax*4]
36538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdi,                    [rdi + rax*4]
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
39f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
40f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Unsigned |a-b| is formed as (a -sat b) OR (b -sat a): one of
        ; the two saturating differences is always zero.
42538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,                   xmm1              ; q2
43f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm3,                   xmm4              ; q1
44538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
45538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm1,                   xmm2              ; q2-=q3
46538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm2,                   xmm6              ; q3-=q2
47538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
48f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm4,                   xmm6              ; q1-=q2
49f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm6,                   xmm3              ; q2-=q1
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
51538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm4,                   xmm6              ; abs(q2-q1)
52538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm1,                   xmm2              ; abs(q3-q2)
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; xmm1 becomes the running per-byte maximum of all the
        ; neighbour differences that are compared against limit
54538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm0,                   xmm5              ; q0
55538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm4
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm5,                   xmm3              ; q0-=q1
58f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm3,                   xmm0              ; q1-=q0
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm5,                   xmm3              ; abs(q0-q1)
61538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      t0,                     xmm5              ; save to t0
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm5
64f71323e297a928af368937089d3ed71239786f86Andreas Huber
65538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1
        ; rax was negated above, so these offsets step to the p side
66f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,                   [rsi+4*rax]       ; p3
67f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,                   [rdi+4*rax]       ; p2
68538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,                   [rsi+2*rax]       ; p1
69f71323e297a928af368937089d3ed71239786f86Andreas Huber%else
70538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm2,                   [rsi + rax]       ; p3
71538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm4,                   [rsi]             ; p2
72538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm6,                   [rsi + rcx]       ; p1
73538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
74538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm2,                   [rdi + rax]
75538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm4,                   [rdi]
76538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm6,                   [rdi + rcx]
77f71323e297a928af368937089d3ed71239786f86Andreas Huber
78f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
79538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
80f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,                   xmm4              ; p2
83538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,                   xmm6              ; p1
84538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; On the p side the two one-sided differences are fed to pmaxub
        ; separately instead of being OR-ed first; the running maximum
        ; ends up the same.
85f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm4,                   xmm2              ; p2-=p3
86f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm2,                   xmm5              ; p3-=p2
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm3,                   xmm5              ; p1-=p2
89538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm5,                   xmm6              ; p2-=p1
92538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
95538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   xmm6              ; p1
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
98f71323e297a928af368937089d3ed71239786f86Andreas Huber%if %1
99f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,                   [rsi+rax]         ; p0
100538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,                   [rdi]             ; q1
101f71323e297a928af368937089d3ed71239786f86Andreas Huber%else
102538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movlps      xmm4,                   [rsi + rcx*2]     ; p0
103538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      xmm4,                   [rdi + rcx*2]
        ; q1 is a stack slot %define'd by the expanding function
        ; NOTE(review): presumably the [rsp + 16] slot written above - confirm
104538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,                   q1                ; q1
105f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,                   xmm4              ; p0
108538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm4,                   xmm6              ; p0-=p1
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
110538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm6,                   xmm5              ; p1-=p0
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm6,                   xmm4              ; abs(p1 - p0)
113538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rdx,                    arg(2)            ; get flimit
114538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
115538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa        t1,                   xmm6              ; save to t1
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,                   xmm3              ; q1
118538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm1,                   xmm6
119538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
120f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm3,                   xmm2              ; q1-=p1
121f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm2,                   xmm4              ; p1-=q1
122538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; xmm7 still holds limit here: xmm1 is left non-zero only in
        ; bytes where some neighbour difference exceeds limit
123538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm1,                   xmm7
124f71323e297a928af368937089d3ed71239786f86Andreas Huber        por         xmm2,                   xmm3              ; abs(p1-q1)
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
126538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   XMMWORD PTR [rdx] ; flimit
127538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; psrlw below shifts 16-bit words, so bit 0 of every byte is
        ; cleared first to keep it from leaking into the neighbouring
        ; byte (tfe is defined elsewhere; presumably 0xfe per byte)
128f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm3,                   xmm0              ; q0
129538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
130538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
131538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rdx,                    arg(4)            ; hev get thresh
132538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
133538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,                   xmm5              ; p0
134538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
135538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
136f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm5,                   xmm3              ; p0-=q0
137538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddb       xmm4,                   xmm4              ; flimit*2 (less than 255)
138538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
139f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm3,                   xmm6              ; q0-=p0
140f71323e297a928af368937089d3ed71239786f86Andreas Huber        por         xmm5,                   xmm3              ; abs(p0 - q0)
141538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
142f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
143538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddb       xmm7,                   xmm4              ; flimit * 2 + limit (less than 255)
144538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
145538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
146538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
147538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
148538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
149f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
154538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm4,                   xmm2              ; hev
155538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; fold the edge-limit failure (xmm5) into the limit failures
        ; already collected in xmm1
156538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm3,                   xmm2              ; hev
157f71323e297a928af368937089d3ed71239786f86Andreas Huber        por         xmm1,                   xmm5
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; xmm7 = 0 from here on; used as the compare-with-zero operand
159538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,                   xmm7
160538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; NOTE(review): this compares against xmm5, not the zeroed
        ; xmm7. In every byte where xmm5 is non-zero the por above has
        ; already made xmm1 non-zero, so the filter mask is 0 there and
        ; only bytes with xmm5 == 0 (a true compare-with-zero) remain
        ; selected. Whether this was intentional is not visible here.
162538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pcmpeqb     xmm4,                   xmm5              ; hev
        ; xmm3 = all ones, used to invert the compare result below
163538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pcmpeqb     xmm3,                   xmm3              ; hev
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; xmm1 = 0xFF where no limit test failed, 0x00 elsewhere
165538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
166538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm4,                   xmm3              ; hev
167f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; B_FILTER <mode>
;
; Adjusts p1, p0, q0 and q1 across the edge, using the filter mask in
; xmm1 and the high-edge-variance (hev) mask in xmm4.
;
;   %1 == 0 : p1 / q1 are read from the p1 / q1 stack defines; results
;             are stored as 8-byte halves (low half via rsi, high half
;             via rdi) after both pointers are advanced by rcx*2.
;   %1 == 1 : p1 / q1 are read from [rsi+2*rax] / [rdi]; results are
;             stored as full 16-byte rows with movdqa.
;   %1 == 2 : p1, p0, q0, q1 are read from the srct buffer; nothing is
;             stored by this macro - results stay in registers.
;
; In  : xmm1 = filter mask, xmm4 = hev mask;
;       xmm6 = p0 and xmm0 = q0 (modes 0 and 1; mode 2 reloads them).
; Out : xmm1 = p1, xmm6 = p0, xmm3 = q0, xmm7 = q1 (filtered, converted
;       back to unsigned).
; Clobbers : xmm0, xmm2, xmm4, xmm5; rdx in mode 2; rsi / rdi in mode 0.
;
; t80, t4, t3 and ones are constants defined elsewhere in the file;
; from the way they are used they are presumably per-byte 0x80, 4, 3
; and per-word 1 - confirm against their definitions.
;-----------------------------------------------------------------------
169538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%macro B_FILTER 1
170538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0
171f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,                   p1                ; p1
172f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,                   q1                ; q1
173538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1
174538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   [rsi+2*rax]       ; p1
175538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   [rdi]             ; q1
176538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2
177538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdx,                    srct
178538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
179538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   [rdx]             ; p1
180538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   [rdx+48]          ; q1
181538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,                   [rdx+16]          ; p0
182538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm0,                   [rdx+32]          ; q0
183f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; flip the top bit of every pixel so the saturating signed
        ; byte instructions below can be used on them
185538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
186538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
188f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubsb      xmm2,                   xmm7              ; p1 - q1
189538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; the (p1 - q1) term only contributes in hev bytes
191f71323e297a928af368937089d3ed71239786f86Andreas Huber        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
192538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
194f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm3,                   xmm0              ; q0
195f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubsb      xmm0,                   xmm6              ; q0 - p0
196538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; three saturating adds instead of a multiply by 3
197f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
198538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
199f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
200538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
201f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
202538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; zero the filter value in every byte the filter mask rejects
203f71323e297a928af368937089d3ed71239786f86Andreas Huber        pand        xmm1,                   xmm2              ; mask filter values we don't care about
204538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
205f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,                   xmm1
206538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
207538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
208538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; There is no per-byte arithmetic shift, so each byte is
        ; unpacked into the HIGH byte of a 16-bit word (the low byte is
        ; whatever the destination held before) and the word is then
        ; shifted right arithmetically by 11 = 8 + 3: the first 8 bits
        ; discard the stale low byte, the remaining 3 are the signed
        ; >> 3.
210f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
211f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
213538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
214f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm5,                   11                ; sign extended shift right by 3
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
216f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
217538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm2,                   11                ; sign extended shift right by 3
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
219538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
220f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm0,                   11                ; sign extended shift right by 3
221f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; xmm0 / xmm1 now hold the low / high eight "+4 >> 3" results
        ; as words; a word copy is kept so the second tap can be
        ; derived from it before packing back to bytes
222538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm1,                   11                ; sign extended shift right by 3
223f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,                   xmm0              ; save results
224f71323e297a928af368937089d3ed71239786f86Andreas Huber
225538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        ; second tap: add the per-word constant, then shift right by 1
226538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsw      xmm5,                   [GLOBAL(ones)]
227f71323e297a928af368937089d3ed71239786f86Andreas Huber
228538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsw      xmm1,                   [GLOBAL(ones)]
229f71323e297a928af368937089d3ed71239786f86Andreas Huber        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
230f71323e297a928af368937089d3ed71239786f86Andreas Huber
231538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
232f71323e297a928af368937089d3ed71239786f86Andreas Huber
233f71323e297a928af368937089d3ed71239786f86Andreas Huber        paddsb      xmm6,                   xmm2              ; p0+= p0 add
234538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
235f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; reload the untouched p1: xmm2, which held it, was reused above
236538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0
237538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm1,                   p1                ; p1
238538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1
239538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm1,                   [rsi+2*rax]       ; p1
240538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2
241538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm1,                   [rdx]             ; p1
242f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
        ; pandn: xmm4 = (NOT hev) AND second tap, so p1 / q1 are only
        ; adjusted in bytes that are NOT high edge variance
243538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pandn       xmm4,                   xmm5              ; high edge variance additive
244538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset
245f71323e297a928af368937089d3ed71239786f86Andreas Huber
246538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
247f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubsb      xmm3,                   xmm0              ; q0-= q0 add
248f71323e297a928af368937089d3ed71239786f86Andreas Huber
249538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm1,                   xmm4              ; p1+= p1 add
250538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset
251538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
252538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
253f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubsb      xmm7,                   xmm4              ; q1-= q1 add
254538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
255538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
256538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0
        ; low 8 bytes of each result go to the rsi plane, high 8 bytes
        ; to the rdi plane
257538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rsi,                    [rsi + rcx*2]
258538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdi,                    [rdi + rcx*2]
259538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi],       xmm6              ; p0
260538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi],       xmm6
261538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
262538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi + rax], xmm1
263538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
264538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi + rcx], xmm3
265f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
266538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi + rcx*2],xmm7
267538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1
        ; same addresses the rows were loaded from: p0, p1, q0, q1
268538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      [rsi+rax],              xmm6              ; write back
269538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      [rsi+2*rax],            xmm1              ; write back
270538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      [rsi],                  xmm3              ; write back
271538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      [rdi],                  xmm7              ; write back
272f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
273538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
274f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; Filters one horizontal edge, 16 bytes wide, in place.
;
; src_ptr points at the first row on the q side of the edge (q0); the
; code reads the rows from src_ptr - 4*src_pixel_step up to
; src_ptr + 3*src_pixel_step and writes back the four rows p1, p0, q0
; and q1.
;
; Every row, and each of flimit / limit / thresh, is accessed with
; movdqa, so all of those addresses must be 16-byte aligned and the
; three parameter pointers must each reference 16 readable bytes.
;
; The count argument is not read by this routine or by the two macro
; expansions it uses.
;
; Stack locals: t0, t1 (16 bytes each), scratch for the mask macro.
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK and their
; counterparts are not defined in this file (presumably they come from
; vpx_ports/x86_abi_support.asm).
;-----------------------------------------------------------------------
277f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_loop_filter_horizontal_edge_sse2
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int            src_pixel_step,
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char    *flimit,
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char    *limit,
283f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
284f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            count
285f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
286f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_loop_filter_horizontal_edge_sse2)
287f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_loop_filter_horizontal_edge_sse2):
288f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
289f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
290f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
291f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
292f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
293f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
294f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
295f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ; t0 / t1 are written with movdqa, so rsp must be 16-byte aligned
    ; before they are carved out.
    ; NOTE(review): ALIGN_STACK presumably pushes the pre-alignment rsp,
    ; which the "pop rsp" in the epilog restores - confirm in its definition.
297f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
298f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub         rsp, 32     ; reserve 32 bytes
299f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
300f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
302f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                    arg(0)           ;src_ptr
303f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; the mask macro expects limit preloaded in xmm7
305f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,                    arg(3)           ;limit
306f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,                   XMMWORD PTR [rdx]
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
308f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; mode 1 of both macros: aligned 16-byte rows addressed through
        ; rsi / rdi / rax; the first macro leaves rax negated and the
        ; second relies on that when it writes the rows back
310538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate breakout conditions and high edge variance
311538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFH_FILTER_AND_HEV_MASK 1
312538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; filter and write back the result
313538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        B_FILTER 1
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ; release t0 / t1, then undo the stack alignment
315f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 32
316f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp
    ; the remaining steps mirror the prolog in reverse order
317f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
318f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
319f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
320f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
321f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
322f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
323f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
324f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
327f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_loop_filter_horizontal_edge_uv_sse2
328f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
329f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *u,
330f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,
331f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,
332f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,
333f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
334f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *v
335f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
;
; Horizontal-edge loop filter applied to the two chroma planes at once.
; The body reads arg(0) as the u pointer and arg(5) as the v pointer, so the
; prototype above names them accordingly.
;
; Register set-up performed here before the macros run:
;   rsi  = u + src_pixel_step
;   rdi  = v + src_pixel_step
;   rcx  = src_pixel_step
;   rax  = -src_pixel_step
;   xmm7 = 16 bytes loaded from limit (movdqa: limit must be 16-byte aligned)
;
; The 96-byte scratch area holds six 16-byte slots (q2, q1, p2, p1, t0, t1)
; for the macros invoked below.
; NOTE(review): flimit and thresh are not read in this body; presumably the
; macros fetch them via arg() - confirm against the macro definitions.
336f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_loop_filter_horizontal_edge_uv_sse2)
337f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_loop_filter_horizontal_edge_uv_sse2):
338f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
339f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
340f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
341f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
342f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
343f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
344f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
345f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
347f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
348f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub         rsp, 96       ; reserve 96 bytes
349f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
350f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
351f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
352f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
353f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
354f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
355f71323e297a928af368937089d3ed71239786f86Andreas Huber
356f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                    arg(0)             ; u
357f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdi,                    arg(5)             ; v
358f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
359f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rcx,                    rax                ; rcx keeps the positive pitch
360f71323e297a928af368937089d3ed71239786f86Andreas Huber        neg         rax                     ; negate pitch to deal with above border
361f71323e297a928af368937089d3ed71239786f86Andreas Huber
362f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,                    arg(3)             ;limit
363f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,                   XMMWORD PTR [rdx]  ; xmm7 = limit vector (aligned load)
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
365f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                    [rsi + rcx]        ; rsi = u + pitch
366f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                    [rdi + rcx]        ; rdi = v + pitch
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
368538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate breakout conditions and high edge variance
369538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFH_FILTER_AND_HEV_MASK 0
370538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; filter and write back the result
371538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        B_FILTER 0
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ; release the 96-byte scratch area reserved above
373f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 96
    ; NOTE(review): presumably reloads the pre-alignment rsp saved by
    ; ALIGN_STACK (defined in x86_abi_support.asm) - confirm
374f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp
375f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
376f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
377f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
378f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
379f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
380f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
381f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
382f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
; MB_FILTER_AND_WRITEBACK mode
;
; Computes the filtered values of p2, p1, p0, q0, q1, q2 and stores them.
; The single macro argument selects where rows are read from and written to:
;   0 - p1/q1/p2/q2 are read from the stack slots of those names.  Each
;       result register is split on store: the low 8 bytes go through rsi
;       (movq) and the high 8 bytes through rdi (movhps).  rsi and rdi are
;       advanced during the write-back; rax and rcx are used as strides.
;   1 - rows are read and written as 16-byte movdqa accesses addressed from
;       rsi/rdi with rax as the stride; rcx is set to -rax by this macro.
;       The addressed rows must therefore be 16-byte aligned.
;   2 - rows are read from the buffer whose address is taken from srct
;       (held in rdx).  Only p1, p0, q0 and q1 are stored back; the filtered
;       p2 and q2 are left in xmm7 and xmm5.
;
; Expected on entry (not loaded by this macro):
;   xmm1 = filter mask, xmm4 = hev mask
;   modes 0 and 1 only: xmm6 = p0, xmm0 = q0 (mode 2 loads them itself)
; Clobbers: xmm0-xmm7; additionally rsi/rdi (mode 0), rcx (mode 1),
;           rdx (mode 2).
; NOTE(review): t80, t3, t4, s9 and s63 are constants defined outside this
; view; the existing comments treat t80 as the 0x80 signed/unsigned offset.
385538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%macro MB_FILTER_AND_WRITEBACK 1
386538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0
387538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   p1              ; p1
388538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   q1              ; q1
389538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1
390538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   [rsi+2*rax]     ; p1
391538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   [rdi]           ; q1
392538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
393538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rcx,                    rax
394538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        neg         rcx                                     ; rcx = -rax (opposite-direction stride)
395538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2
396538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdx,                    srct
397538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
398538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   [rdx+32]        ; p1
399538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   [rdx+80]        ; q1
400538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,                   [rdx+48]        ; p0
401538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm0,                   [rdx+64]        ; q0
402f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
404538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm2,                   [GLOBAL(t80)]   ; p1 offset to convert to signed values
405538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,                   [GLOBAL(t80)]   ; q1 offset to convert to signed values
406538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6,                   [GLOBAL(t80)]   ; p0 offset to convert to signed values
407538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm0,                   [GLOBAL(t80)]   ; q0 offset to convert to signed values
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
409538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm2,                   xmm7            ; p1 - q1
410538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,                   xmm0            ; q0
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
412538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm0,                   xmm6            ; q0 - p0
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
414538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm2,                   xmm0            ; 1 * (q0 - p0) + (p1 - q1)
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
416538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm2,                   xmm0            ; 2 * (q0 - p0) + (p1 - q1)
41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
418538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm2,                   xmm0            ; 3 * (q0 - p0) + (p1 - q1)
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
420538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        xmm1,                   xmm2            ; mask filter values we don't care about
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
422538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   xmm1            ; vp8_filter
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
424538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        xmm2,                   xmm4            ; Filter2 = vp8_filter & hev
425538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm0,                   xmm0
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
427538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pandn       xmm4,                   xmm1            ; vp8_filter&=~hev
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        xmm1,                   xmm1
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Widen (vp8_filter & ~hev) to words with each byte placed in the
        ; high byte of its word.  punpcklbw takes the low eight bytes and
        ; punpckhbw the high eight, so xmm0 holds the low half and xmm1 the
        ; high half; the "(hi)"/"(lo)" labels in the comments that follow
        ; name the registers the other way round.
430538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm0,                   xmm4            ; Filter 2 (hi)
431538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,                   xmm2
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
433538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhbw   xmm1,                   xmm4            ; Filter 2 (lo)
434538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm5,                   [GLOBAL(t3)]    ; vp8_signed_char_clamp(Filter2 + 3)
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
436538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm1,                   [GLOBAL(s9)]    ; Filter 2 (lo) * 9
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
438538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmulhw      xmm0,                   [GLOBAL(s9)]    ; Filter 2 (hi) * 9
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Bytes are unpacked into the high byte of each word, so the
        ; arithmetic shifts by 11 below are 8 (back to the low byte) + 3.
440538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhbw   xmm7,                   xmm5            ; axbxcxdx
441538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm2,                   [GLOBAL(t4)]    ; vp8_signed_char_clamp(Filter2 + 4)
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
443538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm5,                   xmm5            ; exfxgxhx
444538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm7,                   11              ; sign extended shift right by 3
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
446538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm5,                   11              ; sign extended shift right by 3
447538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhbw   xmm4,                   xmm2            ; axbxcxdx
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
449538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm2,                   xmm2            ; exfxgxhx
450538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm4,                   11              ; sign extended shift right by 3
451f71323e297a928af368937089d3ed71239786f86Andreas Huber
452538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm5,                   xmm7            ; Filter2 >>=3;
453538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm2,                   11              ; sign extended shift right by 3
45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
455538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm2,                   xmm4            ; Filter1 >>=3;
456538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   xmm1
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
458538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm6,                   xmm5            ; ps0 =ps0 + Filter2
459538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   xmm1
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
461538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm3,                   xmm2            ; qs0 =qs0 - Filter1
462538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,                   xmm0
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; From here xmm0/xmm2/xmm5 and xmm1/xmm4/xmm7 each start as copies
        ; of the *9 products and are combined into the *9, *18 and *27 terms.
464538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,                   xmm5
465538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm0,                   [GLOBAL(s63)]   ; Filter 2 (hi) * 9 + 63
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
467538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm1,                   [GLOBAL(s63)]   ; Filter 2 (lo) * 9 + 63
468538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm5,                   xmm5            ; Filter 2 (hi) * 18
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
470538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm7,                   xmm7            ; Filter 2 (lo) * 18
471538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm5,                   xmm0            ; Filter 2 (hi) * 27 + 63
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
473538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm7,                   xmm1            ; Filter 2 (lo) * 27 + 63
474538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm2,                   xmm0            ; Filter 2 (hi) * 18 + 63
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
476538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw       xmm4,                   xmm1            ; Filter 2 (lo) * 18 + 63
477538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm0,                   7               ; (Filter 2 (hi) * 9 + 63) >> 7
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
479538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm1,                   7               ; (Filter 2 (lo) * 9 + 63) >> 7
480538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm2,                   7               ; (Filter 2 (hi) * 18 + 63) >> 7
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
482538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm0,                   xmm1            ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
483538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm4,                   7               ; (Filter 2 (lo) * 18 + 63) >> 7
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
485538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm5,                   7               ; (Filter 2 (hi) * 27 + 63) >> 7
486538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm2,                   xmm4            ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
487538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
488538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psraw       xmm7,                   7               ; (Filter 2 (lo) * 27 + 63) >> 7
489f71323e297a928af368937089d3ed71239786f86Andreas Huber
490538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        packsswb    xmm5,                   xmm7            ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
491538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
492538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm3,                   xmm5            ; sq = vp8_signed_char_clamp(qs0 - u3)
493538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm6,                   xmm5            ; sp = vp8_signed_char_clamp(ps0 + u3)
494538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; Reload the unfiltered q2, q1, p1, p2 rows for the outer taps.
495538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0
496538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,                   q2              ; q2
497538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm1,                   q1              ; q1
498538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   p1              ; p1
499538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   p2              ; p2
500538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
501538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1
502538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,                   XMMWORD PTR [rdi+rcx]   ; q2
503538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm1,                   XMMWORD PTR [rdi]       ; q1
504538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   XMMWORD PTR [rsi+rax*2] ; p1
505538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   XMMWORD PTR [rdi+rax*4] ; p2
506538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2
507538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,                   XMMWORD PTR [rdx+96]    ; q2
508538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm1,                   XMMWORD PTR [rdx+80]    ; q1
509538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,                   XMMWORD PTR [rdx+32]    ; p1
510538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,                   XMMWORD PTR [rdx+16]    ; p2
511f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
513538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm3,                   [GLOBAL(t80)]   ; *oq0 = sq^0x80
514538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6,                   [GLOBAL(t80)]   ; *op0 = sp^0x80
51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
516538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm1,                   [GLOBAL(t80)]   ; q1 to signed
517538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm4,                   [GLOBAL(t80)]   ; p1 to signed
518f71323e297a928af368937089d3ed71239786f86Andreas Huber
519538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm1,                   xmm2            ; sq = vp8_signed_char_clamp(qs1 - u2)
520538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm4,                   xmm2            ; sp = vp8_signed_char_clamp(ps1 + u2)
521f71323e297a928af368937089d3ed71239786f86Andreas Huber
522538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm1,                   [GLOBAL(t80)]   ; *oq1 = sq^0x80;
523538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm4,                   [GLOBAL(t80)]   ; *op1 = sp^0x80;
524f71323e297a928af368937089d3ed71239786f86Andreas Huber
525538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,                   [GLOBAL(t80)]   ; p2 to signed
526538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm5,                   [GLOBAL(t80)]   ; q2 to signed
527f71323e297a928af368937089d3ed71239786f86Andreas Huber
528538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm7,                   xmm0            ; sp = vp8_signed_char_clamp(ps2 + u1)
529538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm5,                   xmm0            ; sq = vp8_signed_char_clamp(qs2 - u1)
530f71323e297a928af368937089d3ed71239786f86Andreas Huber
531538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,                   [GLOBAL(t80)]   ; *op2 = sp^0x80;
532538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm5,                   [GLOBAL(t80)]   ; *oq2 = sq^0x80;
533f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; Write-back.  Mode 0 splits every register: low 8 bytes via rsi,
        ; high 8 bytes via rdi.
534538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1 == 0
535538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rsi,                    [rsi+rcx*2]
536538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdi,                    [rdi+rcx*2]
537f71323e297a928af368937089d3ed71239786f86Andreas Huber
538538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi],       xmm6            ; p0
539538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi],       xmm6
540538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi + rcx], xmm3            ; q0
541538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi + rcx], xmm3
542f71323e297a928af368937089d3ed71239786f86Andreas Huber
543538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi+rcx*2], xmm1            ; q1
544538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+rcx*2], xmm1
545f71323e297a928af368937089d3ed71239786f86Andreas Huber
546538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi + rax], xmm4            ; p1
547538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi + rax], xmm4
548538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
549538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi+rax*2], xmm7            ; p2
550538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+rax*2], xmm7
551f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; step one more rcx stride so q2 is reachable as base + 2*rcx
552f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                    [rsi + rcx]
553f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                    [rdi + rcx]
554538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        MMWORD PTR [rsi+rcx*2], xmm5            ; q2
555538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+rcx*2], xmm5
556538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 1
557538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdi+rcx],  xmm5            ; q2
558538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdi],      xmm1            ; q1
559538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rsi],      xmm3            ; q0
560538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rsi+rax  ],xmm6            ; p0
561538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rsi+rax*2],xmm4            ; p1
562538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdi+rax*4],xmm7            ; p2
563538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%elif %1 == 2
        ; only the four inner rows go back to the buffer; p2 (xmm7) and
        ; q2 (xmm5) stay in registers
564538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdx+80],   xmm1            ; q1
565538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdx+64],   xmm3            ; q0
566538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdx+48],   xmm6            ; p0
567538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      XMMWORD PTR [rdx+32],   xmm4            ; p1
568f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
569538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
570f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
571f71323e297a928af368937089d3ed71239786f86Andreas Huber
572f71323e297a928af368937089d3ed71239786f86Andreas Huber
573f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_horizontal_edge_sse2
574f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
575f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *src_ptr,
576f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,
577f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,
578f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,
579f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
580f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            count
581f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
;
; Macroblock-edge loop filter for a horizontal edge, working on 16-byte
; rows addressed directly in the source buffer (macro mode 1).
;
; Register set-up performed here before the macros run:
;   rsi  = src_ptr
;   rax  = src_pixel_step (sign-extended)
;   rdi  = src_ptr + src_pixel_step
;   xmm7 = 16 bytes loaded from limit (movdqa: limit must be 16-byte aligned)
;
; MB_FILTER_AND_WRITEBACK 1 reads and writes the rows with movdqa, so the
; rows reached from src_ptr must be 16-byte aligned.
; NOTE(review): count, flimit and thresh are not read in this body;
; presumably LFH_FILTER_AND_HEV_MASK fetches what it needs via arg() -
; confirm against the macro definition.
582f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_horizontal_edge_sse2)
583f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_horizontal_edge_sse2):
584f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
585f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
586f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
587f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
588f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
589f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
590f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
591f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
592f71323e297a928af368937089d3ed71239786f86Andreas Huber
593f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
594f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub         rsp, 32     ; reserve 32 bytes
595f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
596f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
597f71323e297a928af368937089d3ed71239786f86Andreas Huber
598f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                    arg(0)            ;src_ptr
599f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,                    dword ptr arg(1)  ;src_pixel_step
600f71323e297a928af368937089d3ed71239786f86Andreas Huber
601f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,                    arg(3)            ;limit
602f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,                   XMMWORD PTR [rdx] ; xmm7 = limit vector (aligned load)
603f71323e297a928af368937089d3ed71239786f86Andreas Huber
604f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                    [rsi+rax]         ; rdi points to row +1 for indirect addressing
605f71323e297a928af368937089d3ed71239786f86Andreas Huber
606538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate breakout conditions and high edge variance
607538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFH_FILTER_AND_HEV_MASK 1
608538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; filter and write back the results
609538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        MB_FILTER_AND_WRITEBACK 1
610f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; release the 32-byte scratch area reserved above
611f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 32
    ; NOTE(review): presumably reloads the pre-alignment rsp saved by
    ; ALIGN_STACK (defined in x86_abi_support.asm) - confirm
612f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp
613f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
614f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
615f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
616f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
617f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
618f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
619f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
620f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
621f71323e297a928af368937089d3ed71239786f86Andreas Huber
622f71323e297a928af368937089d3ed71239786f86Andreas Huber
623f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_horizontal_edge_uv_sse2
624f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
625f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *u,
626f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,
627f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,
628f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,
629f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
630f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *v
631f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
;-----------------------------------------------------------------------
; vp8_mbloop_filter_horizontal_edge_uv_sse2
;
; Macroblock loop filter across a horizontal edge, run on the U and V
; planes together.
;
; Arguments read directly in this body:
;   arg(0) = u               pointer into the U plane
;   arg(1) = src_pixel_step  32-bit pitch, sign-extended to pointer width
;   arg(3) = limit           pointer; loaded with movdqa, so it must be
;                            16-byte aligned
;   arg(5) = v               pointer into the V plane
; Six arguments are shadowed; arg(2) and arg(4) are not read here and are
; presumably consumed by the filter macros -- TODO confirm.
;
; Register state handed to the macros:
;   rsi = u + pitch, rdi = v + pitch, rcx = pitch, rax = -pitch,
;   rdx = limit pointer, xmm7 = limit vector
;
; Stack: 96 bytes of 16-byte-aligned locals (q2, q1, p2, p1, t0, t1).
;-----------------------------------------------------------------------
632f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
633f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
634f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
635f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
636f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
637f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
638f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
639f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
640f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
641f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
642f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Six 16-byte scratch slots on an aligned stack so that movdqa can be
    ; used on them. rax is only a scratch register for ALIGN_STACK here;
    ; it is reloaded with the pitch below.
643f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
644f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub         rsp, 96       ; reserve 96 bytes
645f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
646f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
647f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
648f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
649f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
650f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
651f71323e297a928af368937089d3ed71239786f86Andreas Huber
652f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                    arg(0)             ; u
653f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdi,                    arg(5)             ; v
654f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
655f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rcx,                    rax                ; rcx keeps the positive pitch
656f71323e297a928af368937089d3ed71239786f86Andreas Huber        neg         rax                     ; negate pitch to deal with above border
657f71323e297a928af368937089d3ed71239786f86Andreas Huber
658f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rdx,                    arg(3)             ;limit
659f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,                   XMMWORD PTR [rdx]  ; xmm7 = limit vector (aligned load)
660f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; Advance both plane pointers by one row (+pitch); rows on the other
        ; side of the edge are then addressed through the negated pitch in rax.
661f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                    [rsi + rcx]
662f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                    [rdi + rcx]
663f71323e297a928af368937089d3ed71239786f86Andreas Huber
664538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate breakout conditions and high edge variance
665538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFH_FILTER_AND_HEV_MASK 0
666538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; filter and write back the results
667538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        MB_FILTER_AND_WRITEBACK 0
668f71323e297a928af368937089d3ed71239786f86Andreas Huber
    ; Release the 96 bytes of locals, then undo the stack alignment.
    ; NOTE(review): "pop rsp" presumably reloads the pre-alignment rsp that
    ; ALIGN_STACK saved on the stack -- confirm against x86_abi_support.asm.
669f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 96
670f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp
671f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
672f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
673f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
674f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
675f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
676f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
677f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
678f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
679f71323e297a928af368937089d3ed71239786f86Andreas Huber
680f71323e297a928af368937089d3ed71239786f86Andreas Huber
;-----------------------------------------------------------------------
; TRANSPOSE_16X8 %1, %2
;
; Loads sixteen 8-byte rows (movq) and transposes them into eight
; 16-byte vectors, one per source column.
;
; Lane diagrams: each byte is written <row><column>, highest byte on the
; left, so "f2 e2 ... 12 02" is column 2 of rows f..0.
;
; Expected on entry (inferred from the row labels on the loads; set up
; outside this macro -- confirm against the caller):
;   rsi = row 0, rdi = rsi + rax (row 1), rax = pitch, rcx = 3 * pitch
;
; %1 != 0 : rows 8..f follow rows 0..7 in the same plane: rsi and rdi are
;           advanced by 8 * rax, and rdx is pointed at srct.
; %1 == 0 : rows 8..f come from the v pointer: rsi = arg(5) - 4,
;           rdi = rsi + rax. rdx is NOT set by the macro in this case.
; %2 != 0 : columns 2..5 are stored at [rdx + 0/16/32/48].
; %2 == 0 : all eight columns are stored at [rdx + 16 * column].
;
; On exit (both %2 variants): xmm2 = column 0, xmm1 = column 1,
; xmm3 = column 3, xmm4 = column 4, xmm5 = column 5, xmm6 = column 6,
; xmm7 = column 7. xmm0 and the t0 stack slot are used as scratch;
; srct and t0 must be %define'd by the enclosing function.
;-----------------------------------------------------------------------
681538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%macro TRANSPOSE_16X8 2
682538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
683538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
684f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
685538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
686538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
687538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
688f71323e297a928af368937089d3ed71239786f86Andreas Huber
689538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
690f71323e297a928af368937089d3ed71239786f86Andreas Huber
691538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
692538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
693538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
694f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm0,               xmm7            ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
695f71323e297a928af368937089d3ed71239786f86Andreas Huber
696538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
697f71323e297a928af368937089d3ed71239786f86Andreas Huber
698f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        ; All eight rows 0..7 are loaded; retarget rsi at row 8.
699538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1
700538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rsi,                [rsi+rax*8]
701538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%else
702538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rsi,                arg(5)          ; v_ptr
703538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%endif
704f71323e297a928af368937089d3ed71239786f86Andreas Huber
705f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
706f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
707538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
708f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
709f71323e297a928af368937089d3ed71239786f86Andreas Huber
710f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
        ; %1 != 0: advance rdi by 8 rows as well.
        ; %1 == 0: step the reloaded v pointer back 4 bytes.
711538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1
712538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdi,                [rdi+rax*8]
713538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%else
714538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rsi,                [rsi - 4]
715538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%endif
716f71323e297a928af368937089d3ed71239786f86Andreas Huber
717f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
718538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %1
719538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdx,                srct            ; rdx = destination buffer (srct is %define'd outside this macro)
720538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%else
721538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
722538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%endif
723538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
724538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
725f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
726f71323e297a928af368937089d3ed71239786f86Andreas Huber
727f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
728538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
729f71323e297a928af368937089d3ed71239786f86Andreas Huber
730f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
731538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
732f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
733f71323e297a928af368937089d3ed71239786f86Andreas Huber
734f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
735f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; Rows 0..7 are now transposed in xmm2/xmm3/xmm4/xmm7. Columns 0/1 are
        ; parked in t0 so that xmm2 can be reused while rows 8..f are loaded.
736f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      t0,                 xmm2            ; save to free XMM2
737538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
738538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
739538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
740538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
741538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
742f71323e297a928af368937089d3ed71239786f86Andreas Huber
743538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
744f71323e297a928af368937089d3ed71239786f86Andreas Huber
745538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
746f71323e297a928af368937089d3ed71239786f86Andreas Huber
747f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
748f71323e297a928af368937089d3ed71239786f86Andreas Huber
749538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
750f71323e297a928af368937089d3ed71239786f86Andreas Huber
751f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
752f71323e297a928af368937089d3ed71239786f86Andreas Huber
753f71323e297a928af368937089d3ed71239786f86Andreas Huber        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
754538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
755f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
756f71323e297a928af368937089d3ed71239786f86Andreas Huber
757f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm1            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
758f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
759f71323e297a928af368937089d3ed71239786f86Andreas Huber
760f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
761f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
762f71323e297a928af368937089d3ed71239786f86Andreas Huber
763f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
764538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
765f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
766f71323e297a928af368937089d3ed71239786f86Andreas Huber
767f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,               xmm5            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
768f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
769f71323e297a928af368937089d3ed71239786f86Andreas Huber
770f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
771f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
772f71323e297a928af368937089d3ed71239786f86Andreas Huber
773f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
774f71323e297a928af368937089d3ed71239786f86Andreas Huber
775538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
776f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
777538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
        ; Final stage: join the rows 0..7 half with the rows 8..f half of each
        ; column pair (64-bit interleaves) to form full 16-byte columns.
778f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
779f71323e297a928af368937089d3ed71239786f86Andreas Huber
780f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
        ; %2 != 0: store only columns 2..5 at [rdx + 0..48].
        ; %2 == 0: store all eight columns at [rdx + 16 * column].
781538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%if %2
782f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
783f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
784f71323e297a928af368937089d3ed71239786f86Andreas Huber
785f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
786538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
787f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx],              xmm2            ; save 2
788f71323e297a928af368937089d3ed71239786f86Andreas Huber
789f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
790f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
791f71323e297a928af368937089d3ed71239786f86Andreas Huber
792f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+16],           xmm3            ; save 3
793538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
794f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
795f71323e297a928af368937089d3ed71239786f86Andreas Huber
796f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+32],           xmm4            ; save 4
797f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+48],           xmm5            ; save 5
798f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               t0              ; reload columns 0/1 of rows 0..7 saved in t0
799f71323e297a928af368937089d3ed71239786f86Andreas Huber
800538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,               xmm1            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
801f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
802538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
803f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
804f71323e297a928af368937089d3ed71239786f86Andreas Huber%else
805f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+112],          xmm7            ; save 7
806f71323e297a928af368937089d3ed71239786f86Andreas Huber
807f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+96],           xmm6            ; save 6
808f71323e297a928af368937089d3ed71239786f86Andreas Huber
809538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
810f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
811538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
812538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
813538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
814f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+32],           xmm2            ; save 2
815f71323e297a928af368937089d3ed71239786f86Andreas Huber
816f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
817f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
819f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+48],           xmm3            ; save 3
820538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
821f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
823f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+64],           xmm4            ; save 4
824f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+80],           xmm5            ; save 5
825f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               t0              ; reload columns 0/1 of rows 0..7 saved in t0
82690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
827538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,               xmm1            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
828f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
829538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
830f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
832f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx+16],           xmm1            ; save 1
833538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
834f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      [rdx],              xmm2            ; save 0
835f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
836f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; LFV_FILTER_MASK_HEV_MASK %1
;
; Builds the filter mask and the high-edge-variance (hev) mask from
; transposed pixel columns.
;
; Inputs (per the lane comments below):
;   xmm2 = p3, xmm1 = p2, xmm3 = p0, xmm5 = q1, xmm6 = q2, xmm7 = q3
;   rdx  = transposed column buffer; p1 is read through it before rdx is
;          reset to srct
;   %1 != 0 : buffer holds p1, p0, q0, q1 at offsets  0, 16, 32, 48
;   %1 == 0 : buffer holds p3..q3 at 16 * column (p1 = +32, q0 = +64,
;             q1 = +80)
;   arg(2) = flimit, arg(3) = limit, arg(4) = thresh (pointers, loaded
;            with movdqa)
;
; Outputs:
;   xmm1 = 0xff in every byte where no limit test was exceeded, else 0
;   xmm4 = bitwise NOT of xmm6 (hev mask; see the note at pcmpeqb below)
;   xmm0 = 0
;   t0   = abs(p1 - p0), t1 = abs(q1 - q0)
; Clobbers xmm2, xmm3, xmm5, xmm6, xmm7 and rdx (left pointing at thresh).
;-----------------------------------------------------------------------
838538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber%macro LFV_FILTER_MASK_HEV_MASK 1
839f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,               xmm6            ; q2
840f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm0,               xmm7            ; q2-q3
84190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
842f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm7,               xmm6            ; q3-q2
843f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,               xmm5            ; q1
84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Unsigned saturating subtraction leaves zero in one direction, so
        ; OR-ing both directions yields the absolute difference.
845538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm7,               xmm0            ; abs (q3-q2)
846538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm4,               xmm6            ; q1-q2
84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
848f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,               xmm1            ; p2
849538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm6,               xmm5            ; q2-q1
85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
851538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm6,               xmm4            ; abs (q2-q1)
852f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm0,               xmm2            ; p2 - p3;
85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
854538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm2,               xmm1            ; p3 - p2;
855f71323e297a928af368937089d3ed71239786f86Andreas Huber        por         xmm0,               xmm2            ; abs(p2-p3)
856f71323e297a928af368937089d3ed71239786f86Andreas Huber%if %1
857f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               [rdx]           ; p1
858f71323e297a928af368937089d3ed71239786f86Andreas Huber%else
859f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               [rdx+32]        ; p1
860f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
861f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               xmm2            ; p1
        ; xmm0 accumulates the running maximum of all neighbour differences.
862538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm0,               xmm7            ; max(abs(p2-p3), abs(q3-q2))
86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
864f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm5,               xmm1            ; p1-p2
865f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm1,               xmm2            ; p2-p1
86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
867538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,               xmm3            ; p0
868538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm7,               xmm2            ; p0-p1
86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
870538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm1,               xmm5            ; abs(p2-p1)
871538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm0,               xmm6            ; fold in abs(q2-q1)
87290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
873538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm0,               xmm1            ; fold in abs(p2-p1)
874f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               xmm2            ; p1
87590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
876f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm2,               xmm3            ; p1-p0
877538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        lea         rdx,                srct            ; rdx = srct for the q0/q1 loads below
878538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
879f71323e297a928af368937089d3ed71239786f86Andreas Huber        por         xmm2,               xmm7            ; abs(p1-p0)
88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
881f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      t0,                 xmm2            ; save abs(p1-p0)
88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
883538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm0,               xmm2            ; fold in abs(p1-p0)
884538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
885f71323e297a928af368937089d3ed71239786f86Andreas Huber%if %1
886f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               [rdx+32]        ; q0
887f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,               [rdx+48]        ; q1
888f71323e297a928af368937089d3ed71239786f86Andreas Huber%else
889f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               [rdx+64]        ; q0
890f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,               [rdx+80]        ; q1
891f71323e297a928af368937089d3ed71239786f86Andreas Huber%endif
892538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rdx,                arg(3)          ; limit
893538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
894f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm5            ; q0
895f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               xmm7            ; q1
896f71323e297a928af368937089d3ed71239786f86Andreas Huber
897538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm5,               xmm7            ; q0-q1
898f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm7,               xmm6            ; q1-q0
899538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
900f71323e297a928af368937089d3ed71239786f86Andreas Huber        por         xmm7,               xmm5            ; abs(q1-q0)
901f71323e297a928af368937089d3ed71239786f86Andreas Huber
902f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      t1,                 xmm7            ; save abs(q1-q0)
903f71323e297a928af368937089d3ed71239786f86Andreas Huber
904538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
905538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
906538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pmaxub      xmm0,               xmm7            ; fold in abs(q1-q0)
907538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rdx,                arg(2)          ; flimit
908f71323e297a928af368937089d3ed71239786f86Andreas Huber
909538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm0,               xmm4            ; nonzero where the largest difference exceeds limit
910f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               xmm2            ; q1
911538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
912f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm5,               xmm1            ; q1-=p1
913f71323e297a928af368937089d3ed71239786f86Andreas Huber        psubusb     xmm1,               xmm2            ; p1-=q1
914f71323e297a928af368937089d3ed71239786f86Andreas Huber
915538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm5,               xmm1            ; abs(p1-q1)
916f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               xmm3            ; p0
91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; psrlw shifts whole 16-bit words, so the lsb of each byte is cleared
        ; first to stop the high byte's bit 0 leaking into the low byte.
918538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
919538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm1,               xmm6            ; p0-q0
92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
921538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psrlw       xmm5,               1               ; abs(p1-q1)/2
922538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm6,               xmm3            ; q0-p0
92390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
924538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm2,               XMMWORD PTR [rdx]; flimit
92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
926538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        mov         rdx,                arg(4)          ; get thresh
92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
928538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm1,               xmm6            ; abs(q0-p0)
929538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddb       xmm2,               xmm2            ; flimit*2 (less than 255)
93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
931538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,               t0              ; get abs (p1 - p0)
93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
933538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
935538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,               t1              ; get abs (q1 - q0)
93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
937538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm7,               XMMWORD PTR [rdx] ; thresh
93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
939538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
940538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm6,               xmm7            ; abs(p1 - p0) > thresh
94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
942538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddb       xmm4,               xmm2            ; flimit * 2 + limit (less than 255)
943538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm3,               xmm7            ; abs(q1 - q0) > thresh
94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
945538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
946538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; NOTE: xmm6 is compared against xmm0 before xmm0 is zeroed. In every
        ; byte where the final mask in xmm1 ends up set, xmm0 was already zero
        ; (xmm1 |= xmm0, then xmm1 == 0), so the compare there equals a compare
        ; with zero. Elsewhere the hev byte may differ from a true zero compare;
        ; presumably the mask makes those bytes irrelevant -- confirm in the
        ; filter macros that consume xmm1 and xmm4.
948538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        por         xmm1,               xmm0            ; mask
949538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pcmpeqb     xmm6,               xmm0            ; 0xff where neither difference exceeds thresh (valid where mask is set)
95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
951538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm0,               xmm0            ; xmm0 = 0
952538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pcmpeqb     xmm4,               xmm4            ; xmm4 = all ones
95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
954538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pcmpeqb     xmm1,               xmm0            ; mask = 0xff where no limit was exceeded
955538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm4,               xmm6            ; hev mask = NOT xmm6
956f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
958f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro BV_TRANSPOSE 0
        ; BV_TRANSPOSE: regroup four 16-byte column vectors into row order.
        ; Lane notation below is <row><column>, lowest byte on the right.
        ; In:       xmm1, xmm6, xmm3, xmm7 = columns 2, 3, 4, 5 (one byte per row).
        ; Out:      xmm2 = rows 0-3, xmm6 = rows 4-7, xmm1 = rows 8-b, xmm5 = rows c-f,
        ;           4 bytes (columns 2..5) per row.
        ; Clobbers: xmm3, xmm4. xmm7 is only read.
959f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
960f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
961f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
962f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        ; Step 1: byte interleave (columns 2|3 and columns 4|5).
963f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
964f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
966f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
967f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
969f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
970538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
971f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Step 2: word interleave, giving 4 contiguous bytes per row.
973f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
974f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
976f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
977f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
979f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
980538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
981f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; Result layout:
982f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
983f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
984f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
985f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
986f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
988f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro BV_WRITEBACK 2
        ; BV_WRITEBACK %1, %2: store eight 4-byte groups, each at byte offset +2.
        ;   %1 dwords 0..3 -> [rsi], [rdi], [rsi+2*rax], [rdi+2*rax]
        ;   %2 dwords 0..3 -> [rsi+4*rax], [rdi+4*rax], [rsi+2*rcx], [rdi+2*rcx]
        ; The callers visible in this section compute rdi = rsi + rax and
        ; rcx = 3*rax; with those values the stores hit eight consecutive rows.
        ; Side effect: %1 and %2 are each shifted right by 12 bytes in total.
989f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rsi+2],            %1
990f71323e297a928af368937089d3ed71239786f86Andreas Huber        psrldq      %1,                 4               ; bring the next dword into the low lane
99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
992f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rdi+2],            %1
993f71323e297a928af368937089d3ed71239786f86Andreas Huber        psrldq      %1,                 4
99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
995f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rsi+2*rax+2],      %1
996f71323e297a928af368937089d3ed71239786f86Andreas Huber        psrldq      %1,                 4
99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
998f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rdi+2*rax+2],      %1              ; last dword of %1
99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1000f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rsi+4*rax+2],      %2
1001f71323e297a928af368937089d3ed71239786f86Andreas Huber        psrldq      %2,                 4
100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1003f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rdi+4*rax+2],      %2
1004f71323e297a928af368937089d3ed71239786f86Andreas Huber        psrldq      %2,                 4
100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1006f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rsi+2*rcx+2],      %2
1007f71323e297a928af368937089d3ed71239786f86Andreas Huber        psrldq      %2,                 4
100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1009f71323e297a928af368937089d3ed71239786f86Andreas Huber        movd        [rdi+2*rcx+2],      %2              ; last dword of %2
1010f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1013f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_loop_filter_vertical_edge_sse2
1014f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
1015f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *src_ptr,
1016f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,
1017f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,
1018f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,
1019f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
1020f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            count
1021f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
; Flow: transpose -> mask/hev -> B_FILTER -> BV_TRANSPOSE -> two BV_WRITEBACK
; passes of eight rows each.
; Register roles set up in this body:
;   rsi = src_ptr - 4, rdi = rsi + src_pixel_step,
;   rax = src_pixel_step (sign-extended), rcx = 3 * src_pixel_step.
; Stack scratch: t0[16], t1[16], srct[64] (96 bytes below the aligned rsp).
; Only arg(0) and arg(1) are read directly here. TRANSPOSE_16X8,
; LFV_FILTER_MASK_HEV_MASK and B_FILTER are defined outside this section;
; presumably they consume flimit/limit/thresh -- verify there.
1022f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_loop_filter_vertical_edge_sse2)
1023f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_loop_filter_vertical_edge_sse2):
1024f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
1025f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
1026f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
1027f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
1028f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
1029f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
1030f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
1031f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1033f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
1034f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub             rsp, 96      ; reserve 96 bytes
1035f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
1036f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
1037f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1039f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,        arg(0)                  ; src_ptr
1040f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1042f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,        [rsi - 4]               ; start 4 bytes to the left of src_ptr
1043f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
1044f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rcx,        [rax*2+rax]             ; rcx = 3 * src_pixel_step
104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1046f71323e297a928af368937089d3ed71239786f86Andreas Huber        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
1047538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        TRANSPOSE_16X8 1, 1
104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1049538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate filter mask and high edge variance
1050538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFV_FILTER_MASK_HEV_MASK 1
105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1052f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; start work on filters
1053538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        B_FILTER 2
105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1055f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; transpose and write back - only work on q1, q0, p0, p1
1056f71323e297a928af368937089d3ed71239786f86Andreas Huber        BV_TRANSPOSE
1057f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; store 16-line result
105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1059f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdx,        [rax]                   ; rdx = src_pixel_step
1060f71323e297a928af368937089d3ed71239786f86Andreas Huber        neg         rdx                                 ; rdx = -src_pixel_step
106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; xmm1/xmm5 hold rows 8-b / c-f (see the BV_TRANSPOSE lane comments).
        ; NOTE(review): this relies on rsi/rdi addressing row 8 at this point,
        ; presumably advanced inside TRANSPOSE_16X8 -- confirm against that macro.
1062f71323e297a928af368937089d3ed71239786f86Andreas Huber        BV_WRITEBACK xmm1, xmm5
106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1064f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,        [rsi+rdx*8]             ; step back 8 rows
1065f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,        [rdi+rdx*8]             ; step back 8 rows
        ; xmm2/xmm6 hold rows 0-3 / 4-7.
1066f71323e297a928af368937089d3ed71239786f86Andreas Huber        BV_WRITEBACK xmm2, xmm6
106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1068f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 96                                         ; release t0/t1/srct
1069f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp                                             ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK -- confirm
1070f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
1071f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
1072f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
1073f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
1074f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
1075f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
1076f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
1077f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1080f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_loop_filter_vertical_edge_uv_sse2
1081f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
1082f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *u,
1083f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,
1084f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,
1085f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,
1086f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
1087f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *v
1088f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
; Same flow as vp8_loop_filter_vertical_edge_sse2, but the second write-back
; pass reloads rsi from arg(0) (u) instead of stepping back by 8 rows.
; Register roles set up in this body:
;   rsi = u - 4, rdi = rsi + src_pixel_step, rax = src_pixel_step,
;   rcx = 3 * src_pixel_step, rdx = address of srct.
; Only arg(0) and arg(1) are read directly here; the v pointer (arg(5)) is
; presumably loaded inside TRANSPOSE_16X8 -- verify in that macro.
1089f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_loop_filter_vertical_edge_uv_sse2)
1090f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_loop_filter_vertical_edge_uv_sse2):
1091f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
1092f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
1093f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
1094f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
1095f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
1096f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
1097f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
1098f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1100f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
1101f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub             rsp, 96      ; reserve 96 bytes
1102f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
1103f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
1104f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1106f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,        arg(0)                  ; u_ptr
1107f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1109f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,        [rsi - 4]               ; start 4 bytes to the left of u
1110f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
1111f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rcx,        [rax+2*rax]             ; rcx = 3 * src_pixel_step
111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1113f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdx,        srct                    ; rdx = address of the srct scratch area
111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1115538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
1116538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        TRANSPOSE_16X8 0, 1
1117538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
1118538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate filter mask and high edge variance
1119538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFV_FILTER_MASK_HEV_MASK 1
112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1121f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; start work on filters
1122538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        B_FILTER 2
112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1124f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; transpose and write back - only work on q1, q0, p0, p1
1125f71323e297a928af368937089d3ed71239786f86Andreas Huber        BV_TRANSPOSE
112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; NOTE(review): rsi here is whatever TRANSPOSE_16X8 0, 1 left behind,
        ; presumably the v-plane pointer minus 4 -- confirm against that macro.
1127f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1129f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; store 16-line result
        ; xmm1/xmm5 hold rows 8-b / c-f (see the BV_TRANSPOSE lane comments).
1130f71323e297a928af368937089d3ed71239786f86Andreas Huber        BV_WRITEBACK xmm1, xmm5
113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1132f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,        arg(0)                  ; u_ptr
1133f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,        [rsi - 4]               ; back to u - 4
1134f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
        ; xmm2/xmm6 hold rows 0-3 / 4-7; these go to the u rows.
1135f71323e297a928af368937089d3ed71239786f86Andreas Huber        BV_WRITEBACK xmm2, xmm6
113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1137f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 96                                         ; release t0/t1/srct
1138f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp                                             ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK -- confirm
1139f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
1140f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
1141f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
1142f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
1143f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
1144f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
1145f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
1146f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1148f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro MBV_TRANSPOSE 0
        ; MBV_TRANSPOSE: first half of the 8-column -> 16-row transpose.
        ; Lane notation is <row><column>, lowest byte on the right.
        ; In:  rdx -> 16-byte column vectors; offsets 0, 32, 48, 64, 80, 112 are
        ;      read as columns 0, 2, 3, 4, 5, 7. xmm7 = column 1, xmm5 = column 6.
        ; Out: xmm0 = rows 0-1, xmm6 = rows 2-3 (8 bytes per row),
        ;      xmm3 / xmm7 = columns 0-3 / 4-7 of rows 4-7,
        ;      xmm1 / xmm4 = columns 0-3 of rows 8-b / c-f.
        ; xmm2 is scratch; xmm5 is left unchanged for MBV_WRITEBACK_1.
1149f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,               [rdx]               ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1150f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               xmm0                ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1152538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklbw   xmm0,               xmm7                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
1153538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhbw   xmm1,               xmm7                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
1154f71323e297a928af368937089d3ed71239786f86Andreas Huber
1155f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               [rdx+32]            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1156f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm2                ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1157f71323e297a928af368937089d3ed71239786f86Andreas Huber
1158f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm2,               [rdx+48]            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
1159f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhbw   xmm6,               [rdx+48]            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
1160f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; word interleave: columns 0-3 of every row
1161538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm3,               xmm0                ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
1162f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm0,               xmm2                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
1163f71323e297a928af368937089d3ed71239786f86Andreas Huber
1164538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhwd   xmm3,               xmm2                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1165f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm4,               xmm1                ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
1166f71323e297a928af368937089d3ed71239786f86Andreas Huber
1167f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm1,               xmm6                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1168f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm4,               xmm6                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1169f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; columns 4-7 of rows 0-7 (low halves only; high halves are handled in MBV_WRITEBACK_1)
1170f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
1171f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm2,               [rdx+80]            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
1172f71323e297a928af368937089d3ed71239786f86Andreas Huber
1173538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm6,               xmm5                ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
1174f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklbw   xmm6,               [rdx+112]           ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
1175f71323e297a928af368937089d3ed71239786f86Andreas Huber
1176f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm7,               xmm2                ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
1177f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpcklwd   xmm2,               xmm6                ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
1178f71323e297a928af368937089d3ed71239786f86Andreas Huber
1179f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhwd   xmm7,               xmm6                ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
1180f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm6,               xmm0                ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
1181f71323e297a928af368937089d3ed71239786f86Andreas Huber
        ; dword interleave: complete 8-byte rows 0-3
1182f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm0,               xmm2                ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
1183f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm6,               xmm2                ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
1184f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
1185f71323e297a928af368937089d3ed71239786f86Andreas Huber
1186f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro MBV_WRITEBACK_1 0
        ; MBV_WRITEBACK_1: store rows 0-7 (8 bytes each), then assemble rows 8-b.
        ; In:  xmm0, xmm6, xmm3, xmm7, xmm1, xmm5 and rdx as left by MBV_TRANSPOSE.
        ;      Row addressing assumes rdi = rsi + rax and rcx = 3*rax, as the
        ;      caller visible in this section computes them.
        ; Out: xmm1 = rows 8-9, xmm5 = rows a-b, xmm2 = columns 4-7 of rows c-f.
        ;      xmm4 is not touched. Clobbers xmm0 and xmm3.
1187538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi],   xmm0                ; row 0 (low 8 bytes)
1188538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi],   xmm0                ; row 1 (high 8 bytes)
1189f71323e297a928af368937089d3ed71239786f86Andreas Huber
1190538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi+2*rax], xmm6            ; row 2
1191538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+2*rax], xmm6            ; row 3
1192f71323e297a928af368937089d3ed71239786f86Andreas Huber
1193538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm0,               xmm3                ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1194f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm0,               xmm7                ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1196538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhdq   xmm3,               xmm7                ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1198538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi+4*rax], xmm0            ; row 4
1199538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+4*rax], xmm0            ; row 5
120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1201538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi+2*rcx], xmm3            ; row 6
1202538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+2*rcx], xmm3            ; row 7
120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; high halves of columns 4-7 (rows 8-f)
1204f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm2,               [rdx+64]            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
1205f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhbw   xmm2,               [rdx+80]            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1207538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhbw   xmm5,               [rdx+112]           ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
1208f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm0,               xmm2                ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1210538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpcklwd   xmm0,               xmm5                ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
1211538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhwd   xmm2,               xmm5                ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1213538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movdqa      xmm5,               xmm1                ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1214f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,               xmm0                ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
121590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1216538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        punpckhdq   xmm5,               xmm0                ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
1217f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
1218f71323e297a928af368937089d3ed71239786f86Andreas Huber
1219f71323e297a928af368937089d3ed71239786f86Andreas Huber%macro MBV_WRITEBACK_2 0
        ; MBV_WRITEBACK_2: store rows 8-f (8 bytes each).
        ; In:  xmm1 = rows 8-9, xmm5 = rows a-b, xmm4 = columns 0-3 of rows c-f,
        ;      xmm2 = columns 4-7 of rows c-f (as left by MBV_WRITEBACK_1).
        ;      Row addressing assumes rdi = rsi + rax and rcx = 3*rax, with
        ;      rsi at the first of the eight rows being written.
        ; Clobbers xmm1 and xmm4.
1220538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi],   xmm1                ; row 8 (low 8 bytes)
1221538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi],   xmm1                ; row 9 (high 8 bytes)
1222f71323e297a928af368937089d3ed71239786f86Andreas Huber
1223538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi+2*rax], xmm5            ; row a
1224538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+2*rax], xmm5            ; row b
1225f71323e297a928af368937089d3ed71239786f86Andreas Huber
1226f71323e297a928af368937089d3ed71239786f86Andreas Huber        movdqa      xmm1,               xmm4                ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1227f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckldq   xmm1,               xmm2                ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
1228f71323e297a928af368937089d3ed71239786f86Andreas Huber        punpckhdq   xmm4,               xmm2                ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
1229f71323e297a928af368937089d3ed71239786f86Andreas Huber
1230538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi+4*rax], xmm1            ; row c
1231538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+4*rax], xmm1            ; row d
1232f71323e297a928af368937089d3ed71239786f86Andreas Huber
1233538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movq        QWORD  PTR [rsi+2*rcx], xmm4            ; row e
1234538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        movhps      MMWORD PTR [rdi+2*rcx], xmm4            ; row f
1235f71323e297a928af368937089d3ed71239786f86Andreas Huber%endmacro
1236f71323e297a928af368937089d3ed71239786f86Andreas Huber
1237f71323e297a928af368937089d3ed71239786f86Andreas Huber
1238f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_vertical_edge_sse2
1239f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
1240f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *src_ptr,
1241f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,
1242f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,
1243f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,
1244f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,
1245f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            count
1246f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
; Flow: transpose -> mask/hev -> MB_FILTER_AND_WRITEBACK -> MBV_TRANSPOSE ->
; MBV_WRITEBACK_1 (rows 0-7) -> MBV_WRITEBACK_2 (rows 8-f).
; Register roles set up in this body:
;   rsi = src_ptr - 4, rdi = rsi + src_pixel_step,
;   rax = src_pixel_step (negated around the filter step), rcx = 3 * src_pixel_step.
; Stack scratch: t0[16], t1[16], srct[128] (160 bytes below the aligned rsp).
; rdx is read by MBV_TRANSPOSE / MBV_WRITEBACK_1 but is not set in this body;
; presumably one of the externally defined macros points it at srct -- verify.
1247f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_vertical_edge_sse2)
1248f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_vertical_edge_sse2):
1249f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
1250f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
1251f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6
1252f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
1253f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
1254f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
1255f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
1256f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
125790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1258f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax
1259f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub          rsp, 160     ; reserve 160 bytes
1260f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
1261f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
1262f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1264f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                arg(0)              ; src_ptr
1265f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step
126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1267f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                [rsi - 4]           ; start 4 bytes to the left of src_ptr
1268f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
1269f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rcx,                [rax*2+rax]         ; rcx = 3 * src_pixel_step
127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1271f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; Transpose
1272538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        TRANSPOSE_16X8 1, 0
127390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1274538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate filter mask and high edge variance
1275538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFV_FILTER_MASK_HEV_MASK 0
127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1277f71323e297a928af368937089d3ed71239786f86Andreas Huber        neg         rax                                     ; rax = -src_pixel_step
1278f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; start work on filters
1279538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        MB_FILTER_AND_WRITEBACK 2
128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1281f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                [rsi+rax*8]         ; rax is negative here: step back 8 rows
1282f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                [rdi+rax*8]         ; rax is negative here: step back 8 rows
128390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1284f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; transpose and write back
1285f71323e297a928af368937089d3ed71239786f86Andreas Huber        MBV_TRANSPOSE
128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1287f71323e297a928af368937089d3ed71239786f86Andreas Huber        neg         rax                                     ; rax = +src_pixel_step again for the stores
128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1289f71323e297a928af368937089d3ed71239786f86Andreas Huber        MBV_WRITEBACK_1
129090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1291f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                [rsi+rax*8]         ; advance 8 rows
1292f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                [rdi+rax*8]         ; advance 8 rows
1293f71323e297a928af368937089d3ed71239786f86Andreas Huber        MBV_WRITEBACK_2
129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1295f71323e297a928af368937089d3ed71239786f86Andreas Huber    add rsp, 160                                            ; release t0/t1/srct
1296f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsp                                                 ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK -- confirm
1297f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; begin epilog
1298f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rdi
1299f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop rsi
1300f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_GOT
1301f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
1302f71323e297a928af368937089d3ed71239786f86Andreas Huber    UNSHADOW_ARGS
1303f71323e297a928af368937089d3ed71239786f86Andreas Huber    pop         rbp
1304f71323e297a928af368937089d3ed71239786f86Andreas Huber    ret
130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; Vertical-edge macroblock filter for the u/v pointer pair; the work is done by macros defined earlier in this file, and this body sets up their register inputs and the stack scratch area.
1307f71323e297a928af368937089d3ed71239786f86Andreas Huber;void vp8_mbloop_filter_vertical_edge_uv_sse2
1308f71323e297a928af368937089d3ed71239786f86Andreas Huber;(
1309f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *u,               ; arg(0): rows are addressed starting at u - 4
1310f71323e297a928af368937089d3ed71239786f86Andreas Huber;    int            src_pixel_step,  ; arg(1): row stride, sign-extended into rax
1311f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *flimit,          ; arg(2)..arg(4) are not referenced directly in this body;
1312f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *limit,           ; presumably they are read inside the mask/filter macros
1313f71323e297a928af368937089d3ed71239786f86Andreas Huber;    const char    *thresh,          ; (TODO confirm against the macro definitions)
1314f71323e297a928af368937089d3ed71239786f86Andreas Huber;    unsigned char *v                ; arg(5): rows are addressed starting at v - 4 for the second write-back
1315f71323e297a928af368937089d3ed71239786f86Andreas Huber;)
1316f71323e297a928af368937089d3ed71239786f86Andreas Huberglobal sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
1317f71323e297a928af368937089d3ed71239786f86Andreas Hubersym(vp8_mbloop_filter_vertical_edge_uv_sse2):
1318f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rbp
1319f71323e297a928af368937089d3ed71239786f86Andreas Huber    mov         rbp, rsp
1320f71323e297a928af368937089d3ed71239786f86Andreas Huber    SHADOW_ARGS_TO_STACK 6    ; six arguments, accessed below through arg(n)
1321f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
1322f71323e297a928af368937089d3ed71239786f86Andreas Huber    GET_GOT     rbx
1323f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rsi
1324f71323e297a928af368937089d3ed71239786f86Andreas Huber    push        rdi
1325f71323e297a928af368937089d3ed71239786f86Andreas Huber    ; end prolog
132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1327f71323e297a928af368937089d3ed71239786f86Andreas Huber    ALIGN_STACK 16, rax       ; NOTE(review): presumably saves the old rsp, which "pop rsp" below restores - confirm in x86_abi_support.asm
1328f71323e297a928af368937089d3ed71239786f86Andreas Huber    sub          rsp, 160     ; reserve 160 bytes: t0 (16) + t1 (16) + srct (128)
1329f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
1330f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
1331f71323e297a928af368937089d3ed71239786f86Andreas Huber    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];
133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1333f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                arg(0)              ; u_ptr
1334f71323e297a928af368937089d3ed71239786f86Andreas Huber        movsxd      rax,                dword ptr arg(1)    ; src_pixel_step, sign-extended to 64 bits
133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1336f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                [rsi - 4]           ; rsi = u_ptr - 4
1337f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                [rsi + rax]         ; rdi points to row +1 for indirect addressing
1338f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rcx,                [rax+2*rax]         ; rcx = 3 * src_pixel_step
133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1340f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdx,                srct                ; rdx = address of the 128-byte srct scratch buffer
134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1342538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; Transpose (NOTE(review): macro body is not in view; presumably consumes rsi/rdi/rax/rcx/rdx set up above)
1343538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        TRANSPOSE_16X8 0, 0
1344538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber
1345538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        ; calculate filter mask and high edge variance
1346538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        LFV_FILTER_MASK_HEV_MASK 0
134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1348f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; start work on filters
1349538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        MB_FILTER_AND_WRITEBACK 2
135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1351f71323e297a928af368937089d3ed71239786f86Andreas Huber        ; transpose and write back
1352f71323e297a928af368937089d3ed71239786f86Andreas Huber        MBV_TRANSPOSE
135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1354f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                arg(0)             ;u_ptr (reloaded from the argument for the first write-back)
1355f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                [rsi - 4]          ; rsi = u_ptr - 4, same origin as the load step
1356f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                [rsi + rax]        ; rdi = one row after rsi
1357f71323e297a928af368937089d3ed71239786f86Andreas Huber        MBV_WRITEBACK_1
1358f71323e297a928af368937089d3ed71239786f86Andreas Huber        mov         rsi,                arg(5)             ;v_ptr
1359f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rsi,                [rsi - 4]          ; rsi = v_ptr - 4
1360f71323e297a928af368937089d3ed71239786f86Andreas Huber        lea         rdi,                [rsi + rax]        ; rdi = one row after rsi
1361f71323e297a928af368937089d3ed71239786f86Andreas Huber        MBV_WRITEBACK_2
136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 160              ; release t0/t1/srct
136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp                   ; reload rsp from the stack (value presumably pushed by ALIGN_STACK)
136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
136890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
1369f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; Simple loop filter across a horizontal edge, 16 bytes wide: adjusts only the p0 row ([src_ptr - stride]) and the q0 row ([src_ptr]) where abs(p0-q0)*2 + abs(p1-q1)/2 does not exceed flimit*2 + limit.
137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_horizontal_edge_sse2
137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,       ; arg(0): address of the q0 row; read and written with unaligned moves
137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,          ; arg(1): row stride, sign-extended to 64 bits
137990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,           ; arg(2): 16 bytes loaded with movdqa, so must be 16-byte aligned
138090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,            ; arg(3): 16 bytes loaded with movdqa, so must be 16-byte aligned
138190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,           ; arg(4): not referenced by this routine
138290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count                     ; arg(5): not referenced by this routine
138390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
138490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_simple_horizontal_edge_sse2)
138590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_horizontal_edge_sse2):
138690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
138790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
138890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
1389f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
139090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
139190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
139290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
139390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
139490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0)             ;src_ptr (the q0 row)
139690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1)   ;src_pixel_step (row stride), sign-extended
139790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(2) ;flimit     ; rdx = flimit pointer
139890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm3, XMMWORD PTR [rdx] ; xmm3 = 16 flimit bytes (aligned load)
139990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx, arg(3) ;limit      ; rdx = limit pointer
140090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm7, XMMWORD PTR [rdx] ; xmm7 = 16 limit bytes (aligned load)
140190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       xmm3, xmm3              ; flimit*2 (byte add wraps; expected to stay below 255)
140390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       xmm3, xmm7              ; flimit * 2 + limit (byte add wraps; expected to stay below 255)
140490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
140690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdi, rax
140790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax                     ; rax = -stride, so [rsi+rax] is the row before src_ptr
140890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate mask
141090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm1, [rsi+2*rax]       ; p1 (two rows before src_ptr)
141190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm0, [rdi]             ; q1 (one row after src_ptr)
141290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm2, xmm1              ; keep p1 for the filter stage
141390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm7, xmm0              ; keep q1 for the filter stage (limit is already folded into xmm3)
141490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm4, xmm0              ; scratch copy of q1
141590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm0, xmm1              ; q1-=p1 (unsigned saturating)
141690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm1, xmm4              ; p1-=q1 (unsigned saturating)
141790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm1, xmm0              ; abs(p1-q1)
1418538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero so the word shift below cannot carry a bit into the neighbouring byte
141990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       xmm1, 1                 ; abs(p1-q1)/2
142090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
142190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm5, [rsi+rax]         ; p0
142290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm4, [rsi]             ; q0
142390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0, xmm4              ; q0 (kept for the filter stage)
142490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm6, xmm5              ; p0 (kept for the filter stage)
142590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm5, xmm4              ; p0-=q0
142690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm4, xmm6              ; q0-=p0
142790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm5, xmm4              ; abs(p0 - q0)
142890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     xmm5, xmm5              ; abs(p0-q0)*2 (saturating)
142990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2 (saturating)
143090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
143190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm5, xmm3              ; saturating subtract: nonzero only where the sum exceeds flimit * 2 + limit
143290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        xmm3, xmm3              ; xmm3 = 0
143390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     xmm5, xmm3              ; xmm5 = mask: 0xFF where the byte is to be filtered, 0 elsewhere
143490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
143590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
1436538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
1437538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
143890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      xmm2, xmm7              ; p1 - q1 (signed saturating)
143990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1440538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6, [GLOBAL(t80)]     ; p0 offset to convert to signed values
1441538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm0, [GLOBAL(t80)]     ; q0 offset to convert to signed values
144290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm3, xmm0              ; signed q0, kept for the write-back
144390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      xmm0, xmm6              ; q0 - p0
144490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
144590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
144690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
144790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        xmm5, xmm2              ; mask filter values we don't care about (xmm5 now holds the filter value)
144890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
144990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; do + 4 side
1450538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
145190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
145290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0, xmm5              ; get a copy of filters (for the low byte of each word)
145390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm0, 8                 ; shift left 8: low byte moves into the high byte of its word
145490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm0, 3                 ; arithmetic shift right 3, sign taken from the original low byte
145590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       xmm0, 8                 ; logical shift right 8: result back in the low byte, high byte cleared
145690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm1, xmm5              ; get a copy of filters (for the high byte of each word)
145790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm1, 11                ; arithmetic shift right 11: high byte >> 3 (signed), left in the low bits
145890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm1, 8                 ; shift left 8 to put it back in the high byte (low byte cleared)
145990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm0, xmm1              ; put the two together: per-byte signed (filter + 4) >> 3
146190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      xmm3, xmm0              ; q0 -= (filter + 4) >> 3
1463538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
146490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      [rsi], xmm3             ; write back the q0 row
146590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; now do +3 side
1467538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
146890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0, xmm5              ; get a copy of filters (for the low byte of each word)
147090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm0, 8                 ; shift left 8: low byte moves into the high byte of its word
147190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm0, 3                 ; arithmetic shift right 3, sign taken from the original low byte
147290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       xmm0, 8                 ; logical shift right 8: result back in the low byte, high byte cleared
147390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm5, 11                ; arithmetic shift right 11: high byte >> 3 (signed), left in the low bits
147490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm5, 8                 ; shift left 8 to put it back in the high byte (low byte cleared)
147590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm0, xmm5              ; put the two together: per-byte signed (filter + 3) >> 3
147690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm6, xmm0              ; p0 += (filter + 3) >> 3
1479538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
148090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      [rsi+rax], xmm6         ; write back the p0 row (rax holds -stride here)
148190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
148390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
148490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
148590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
1486f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
148790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
148890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
148990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
149090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_loop_filter_simple_vertical_edge_sse2
149390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
149490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
149590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  src_pixel_step,
149690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *flimit,
149790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *limit,
149890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    const char *thresh,
149990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int count
150090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
150190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_loop_filter_simple_vertical_edge_sse2)
150290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_loop_filter_simple_vertical_edge_sse2):
150390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp         ; save old base pointer value.
150490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp    ; set new base pointer value.
150590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
1506f71323e297a928af368937089d3ed71239786f86Andreas Huber    SAVE_XMM
150790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx         ; save callee-saved reg
150890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rsi
150990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rdi
151090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
151190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ALIGN_STACK 16, rax
151390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub         rsp, 32                         ; reserve 32 bytes
151490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
151590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
151690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(0) ;src_ptr
151890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
151990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi - 2 ]
152190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax]
152290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        [rsi + rax*4]
152390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rcx,        [rdx + rax]
152490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
152690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
152790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm2,       [rdi]                   ; 13 12 11 10
152890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm3,       [rcx]                   ; 53 52 51 50
152990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
153090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
153190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm4,       [rsi + rax*2]           ; 23 22 21 20
153390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm5,       [rdx + rax*2]           ; 63 62 61 60
153490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm6,       [rdi + rax*2]           ; 33 32 31 30
153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm7,       [rcx + rax*2]           ; 73 72 71 70
153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm1,       xmm0
154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm2,       xmm0
154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      t0,         xmm0                    ; save to t0
155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      t1,         xmm2                    ; save to t1
155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi + rax*8]
155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax]
155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        [rsi + rax*4]
155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rcx,        [rdx + rax]
155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm4,       [rsi]                   ; 83 82 81 80
155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm1,       [rdx]                   ; c3 c2 c1 c0
156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm6,       [rdi]                   ; 93 92 91 90
156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm3,       [rcx]                   ; d3 d2 d1 d0
156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqu      xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm1,       xmm4
157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm6,       xmm4
158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
158590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm1,       xmm0
158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm3,       xmm2
158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; calculate mask
159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm6,       xmm0                            ; p1
159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm7,       xmm3                            ; q1
159690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm7,       xmm0                            ; q1-=p1
159790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm6,       xmm3                            ; p1-=q1
159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm6,       xmm7                            ; abs(p1-q1)
1599538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       xmm6,       1                               ; abs(p1-q1)/2
160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm5,       xmm1                            ; p0
160390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm4,       xmm2                            ; q0
160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm5,       xmm2                            ; p0-=q0
160590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm4,       xmm1                            ; q0-=p0
160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm5,       xmm4                            ; abs(p0 - q0)
160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(2)                          ;flimit
161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm7, XMMWORD PTR [rdx]
161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdx,        arg(3)                          ; get limit
161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm6, XMMWORD PTR [rdx]
161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       xmm7,        xmm7                           ; flimit*2 (less than 255)
161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddb       xmm7,        xmm6                           ; flimit * 2 + limit (less than 255)
161690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > flimit * 2 + limit
161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        xmm7,        xmm7
161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; start work on filters
162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa        t0,        xmm0
162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa        t1,        xmm3
162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1625538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
1626538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      xmm0,        xmm3                           ; p1 - q1
162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm6,        xmm1                           ; p0
163090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm7,        xmm2                           ; q0
1632538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
163390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1634538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm3,        xmm7                           ; offseted ; q0
163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      xmm7,        xmm6                           ; q0 - p0
163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        xmm5,        xmm0                           ; mask filter values we don't care about
164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1646538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0,        xmm5                           ; get a copy of filters
164990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm0,        8                              ; shift left 8
165090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm0,        3                              ; arithmetic shift right 11
165290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       xmm0,        8
165390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm7,        xmm5                           ; get a copy of filters
165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm7,        11                             ; arithmetic shift right 11
165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm7,        8                              ; shift left 8 to put it back
165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm0,        xmm7                           ; put the two together to get result
165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
1661538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; now do +3 side
1664538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0,        xmm5                           ; get a copy of filters
166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm0,        8                              ; shift left 8
166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm0,        3                              ; arithmetic shift right 11
166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       xmm0,        8
167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       xmm5,        11                             ; arithmetic shift right 11
167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       xmm5,        8                              ; shift left 8 to put it back
167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         xmm0,        xmm5                           ; put the two together to get result
167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddsb      xmm6,        xmm0                           ; p0+= p0 add
1677538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm0,        t0                             ; p1
168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm4,        t1                             ; q1
168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; transpose back to write out
168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm1,       xmm0
168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm5,       xmm3
169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm2,       xmm0
169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
169890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movdqa      xmm3,       xmm1
170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; write out order: xmm0 xmm2 xmm1 xmm3
170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        [rsi + rax*4]
170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi],      xmm1                               ; write the second 8-line result
170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm1,       4
170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi],      xmm1
170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm1,       4
171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi + rax*2], xmm1
171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm1,       4
171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi + rax*2], xmm1
171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdx],      xmm3
171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm3,       4
171690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rcx],      xmm3
171790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm3,       4
171890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdx + rax*2], xmm3
171990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm3,       4
172090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rcx + rax*2], xmm3
172190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
172290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
172390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rsi,        [rsi + rax*8]
172490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        neg         rax
172590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdi,        [rsi + rax]
172690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rdx,        [rsi + rax*4]
172790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         rcx,        [rdx + rax]
172890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
172990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi],      xmm0                                ; write the first 8-line result
173090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm0,       4
173190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi],      xmm0
173290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm0,       4
173390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rsi + rax*2], xmm0
173490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm0,       4
173590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdi + rax*2], xmm0
173690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdx],      xmm2
173890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm2,       4
173990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rcx],      xmm2
174090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm2,       4
174190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rdx + rax*2], xmm2
174290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrldq      xmm2,       4
174390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        [rcx + rax*2], xmm2
174490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
174590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 32
174690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsp
174790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
174890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
174990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
175090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
1751f71323e297a928af368937089d3ed71239786f86Andreas Huber    RESTORE_XMM
175290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
175390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
175490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
175590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
175690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas HuberSECTION_RODATA  ; constant tables for the loop filter code above; SECTION_RODATA is a macro defined outside this file section (presumably selects the read-only data section -- confirm in x86_abi_support.asm)
175790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16  ; every table starts on a 16-byte boundary: they are used directly as 128-bit memory operands, e.g. pxor xmm0, [GLOBAL(t80)]
175890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubertfe:  ; 16 bytes of 0xfe; not referenced in the visible part of the file (presumably clears bit 0 of each byte ahead of a shift -- verify against its users)
175990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 16 db 0xfe
176090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
176190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert80:  ; 16 bytes of 0x80; XORed into pixel bytes to offset them to signed values, and XORed again to undo the offset
176290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 16 db 0x80
176390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
176490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert1s:  ; 16 bytes of 0x01; subtracted (psubsb) from the "+4" filter value to obtain the "+3" variant
176590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 16 db 0x01
176690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
176790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert3:  ; 16 bytes of 0x03; not referenced in the visible part of the file
176890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 16 db 0x03
176990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubert4:  ; 16 bytes of 0x04; added (paddsb) to the masked filter value: 3*(q0 - p0) + (p1 - q1) + 4
177190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 16 db 0x04
177290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberones:  ; 8 words of 0x0001; users are outside the visible part of the file
177490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 dw 0x0001
177590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers9:  ; 8 words of 0x0900 (value 9 in the high byte of each word); users are outside the visible part of the file
177790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 dw 0x0900
177890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
177990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubers63:  ; 8 words of 0x003f (decimal 63); users are outside the visible part of the file
178090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 8 dw 0x003f
1781