;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan%include "vpx_ports/x86_abi_support.asm"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; FIRST_2_ROWS - first half of the 5-tap deblock kernel (16 pixels wide).
;
; In:    xmm0 = centre pixels
;        xmm1 = first neighbour, xmm3 = second neighbour
;        flimit = 16-byte memory operand holding the per-pixel limits
;                 (must be %define'd by the enclosing function)
; Out:   xmm5 = pavgb(neighbour1, neighbour2)
;        xmm7 = byte mask, 0xFF where |centre - neighbour| >= flimit
;               for at least one of the two neighbours
;        xmm0 = unchanged
; Clobb: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm6
;-----------------------------------------------------------------------
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3             ; rounded average of the two neighbours

        ;calculate absolute value
        ; |a - b| = sat(a - b) + sat(b - a); one of the two terms is always 0
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1             ; xmm4 = |centre - neighbour1|
        paddusb     xmm6,       xmm3             ; xmm6 = |centre - neighbour2|

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1             ; zero, used as compare operand
        movdqa      xmm7,       xmm2

        ;get mask
        ; sat(flimit - diff) == 0  <=>  diff >= flimit
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2             ; either neighbour over the limit
%endmacro
41233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; SECOND_2_ROWS - second half of the 5-tap deblock kernel; must follow
; FIRST_2_ROWS with xmm0, xmm5 and xmm7 still intact.
;
; In:    xmm0 = centre pixels
;        xmm1 = third neighbour, xmm3 = fourth neighbour
;        xmm5 = average of the first two neighbours (from FIRST_2_ROWS)
;        xmm7 = over-limit mask so far (from FIRST_2_ROWS)
;        flimit = 16-byte memory operand holding the per-pixel limits
; Out:   xmm0 = per byte: the original centre pixel where any of the four
;               neighbour differences is >= flimit, otherwise
;               pavgb(pavgb(avg12, avg34), centre)
; Clobb: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1             ; keep neighbour3 for the abs-diff
        pavgb       xmm1,       xmm3             ; xmm1 = avg(neighbour3, neighbour4)

        ;calculate absolute value
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2             ; xmm6 = |centre - neighbour3|
        paddusb     xmm4,       xmm3             ; xmm4 = |centre - neighbour4|

        pavgb       xmm5,       xmm1             ; average of all four neighbours

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm3,       xmm2

        ;get mask
        ; sat(flimit - diff) == 0  <=>  diff >= flimit
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        por         xmm7,       xmm2
        por         xmm7,       xmm3             ; xmm7 = any of 4 diffs over the limit

        pavgb       xmm5,       xmm0             ; blend neighbour average with centre

        ;decide if or not to use filtered value
        pand        xmm0,       xmm7             ; original where mask is set
        pandn       xmm7,       xmm5             ; filtered where mask is clear
        paddusb     xmm0,       xmm7             ; disjoint lanes, so add acts as OR
%endmacro
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; UPDATE_FLIMIT - copy the next 16 limit bytes to the stack slot at [rsp].
;
; In:    rbx = pointer to the next 16 limit bytes
;        rsp = destination slot
; Out:   [rsp] = the 16 bytes read from [rbx]; rbx advanced by 16
; Clobb: xmm2
; Note:  both accesses are movdqa, so rbx and rsp must be 16-byte aligned
;        whenever this macro runs.
;-----------------------------------------------------------------------
%macro UPDATE_FLIMIT 0
        movdqa      xmm2,       XMMWORD PTR [rbx]
        movdqa      [rsp],      xmm2
        add         rbx,        16
%endmacro
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass: for every 16-pixel group, filter src vertically using
;      rows -2, -1, 0, +1, +2 and write the result to dst.
;   2. replicate the first/last dst pixel 8 times into the left/right
;      border of the dst row.
;   3. "across" pass: filter the dst row in place horizontally using
;      columns -2, -1, 0, +1, +2.
; A pixel is replaced only when all four neighbour differences are below
; the matching flimit byte (see FIRST_2_ROWS / SECOND_2_ROWS).
;
; Register roles:
;   rsi = src row, rdi = dst row, rax = src stride (temporarily negated),
;   rbx = running flimits pointer, rcx = rows remaining, rdx = column offset
;   [rsp] (aliased as flimit) = current 16 limit bytes
;
; Notes:
;   - flimits is read 16 bytes per 16 columns with movdqa, i.e. one byte per
;     column and 16-byte aligned.
;     NOTE(review): the prototype above says int*; confirm against the C
;     declaration.
;   - The routine writes 8 bytes before dst and 8 bytes at dst + cols on every
;     row, and reads up to 2 rows above/below src.
;   - mm0/mm1 are used and no emms is executed here, so the MMX/x87 state
;     must be reset by the caller before any x87 floating-point use.
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16                          ; 16-byte aligned slot for flimit

        ; put flimit on stack
        mov         rbx,        arg(5)           ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr
        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx              ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax                          ; negative stride
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx              ; rewind to start of row
        sub         rdi,        rdx

        mov         rbx,        arg(5) ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,   [rdi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        mov         rdx,    -8
        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(4)
        movq        mm1,   [rdi + rdx + -1]      ; low byte = last pixel of the row
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rdi+rdx], mm1

        ; Stores in the across loop lag one iteration behind (held in mm0/mm1)
        ; so the -2/-1 taps of the next group still read unfiltered pixels.
        ; Prime mm0/mm1 with the 16 bytes left of the row so the first
        ; iteration's store writes them back unchanged.
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
        movdq2q     mm0,        xmm0             ; hold this group's low 8 bytes
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0             ; hold this group's high 8 bytes

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the high 8 bytes belong to the row only when rdx landed exactly on
        ; cols; otherwise they lie past the row and are discarded
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        ; strides are sign-extended here exactly as in the initial load above
        add         rsi,rax                      ;next src line
        movsxd      rax,        DWORD PTR arg(3) ;dst_pixels_per_line
        add         rdi,rax                      ;next destination
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits
        UPDATE_FLIMIT

        dec         rcx                          ;decrement count
        jnz         .nextrow                     ;next row

    add rsp, 16
    pop rsp                                      ; undo ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan;                            int pitch, int rows, int cols,int flimit)
235233d2500723e5594f3e7c70896ffeeef32b9c950ywanextern sym(vp8_rv)
236233d2500723e5594f3e7c70896ffeeef32b9c950ywanglobal sym(vp8_mbpost_proc_down_xmm) PRIVATE
237233d2500723e5594f3e7c70896ffeeef32b9c950ywansym(vp8_mbpost_proc_down_xmm):
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rbp
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         rbp, rsp
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SHADOW_ARGS_TO_STACK 5
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan    SAVE_XMM 7
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan    GET_GOT     rbx
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rsi
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan    push        rdi
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; end prolog
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ALIGN_STACK 16, rax
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan    sub         rsp, 128+16
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; unsigned char d[16][8] at [rsp]
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; create flimit2 at [rsp+128]
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         eax, dword ptr arg(4) ;flimit
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         [rsp+128], eax
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         [rsp+128+4], eax
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         [rsp+128+8], eax
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mov         [rsp+128+12], eax
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan%define flimit4 [rsp+128]
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if ABI_IS_32BIT=0
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    lea         r8,       [GLOBAL(sym(vp8_rv))]
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;rows +=8;
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         dword arg(2), 8
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ;for(c=0; c<cols; c+=8)
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan.loop_col:
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rsi,        arg(0) ; s
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pxor        xmm0,       xmm0        ;
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movsxd      rax,        dword ptr arg(1) ;pitch       ;
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan            ; this copies the last row down into the border 8 rows
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rdi,        rsi
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rdx,        arg(2)
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan            sub         rdx,        9
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan            imul        rdx,        rax
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rdi,        [rdi+rdx]
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm1,       QWORD ptr[rdi]              ; first row
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rcx,        8
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan.init_borderd                                                    ; initialize borders
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rdi,        [rdi + rax]
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        [rdi],      xmm1
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan            dec         rcx
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan            jne         .init_borderd
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan            neg         rax                                     ; rax = -pitch
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan            ; this copies the first row up into the border 8 rows
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rdi,        rsi
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm1,       QWORD ptr[rdi]              ; first row
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rcx,        8
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan.init_border                                                    ; initialize borders
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rdi,        [rdi + rax]
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        [rdi],      xmm1
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan            dec         rcx
299233d2500723e5594f3e7c70896ffeeef32b9c950ywan            jne         .init_border
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan            neg         rax
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pxor        xmm5,       xmm5
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pxor        xmm6,       xmm6        ;
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pxor        xmm7,       xmm7        ;
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rdi,        rsi
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rcx,        15          ;
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan.loop_initvar:
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm1,       QWORD PTR [rdi];
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklbw   xmm1,       xmm0        ;
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddw       xmm5,       xmm1        ;
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pmullw      xmm1,       xmm1        ;
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm2,       xmm1        ;
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklwd   xmm1,       xmm0        ;
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpckhwd   xmm2,       xmm0        ;
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddd       xmm6,       xmm1        ;
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddd       xmm7,       xmm2        ;
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rdi,        [rdi+rax]   ;
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan            dec         rcx
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan            jne         .loop_initvar
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan            ;save the var and sum
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan            xor         rdx,        rdx
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan.loop_row:
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklbw   xmm1,       xmm0
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklbw   xmm2,       xmm0
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan
341233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddw       xmm5,       xmm2
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubw       xmm5,       xmm1
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pmullw      xmm2,       xmm2
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm4,       xmm2
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklwd   xmm2,       xmm0
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpckhwd   xmm4,       xmm0
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddd       xmm6,       xmm2
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddd       xmm7,       xmm4
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pmullw      xmm1,       xmm1
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm2,       xmm1
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklwd   xmm1,       xmm0
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm6,       xmm1
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpckhwd   xmm2,       xmm0
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm7,       xmm2
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm3,       xmm6
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pslld       xmm3,       4
365233d2500723e5594f3e7c70896ffeeef32b9c950ywan
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm3,       xmm6
367233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm1,       xmm5
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm4,       xmm5
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pmullw      xmm1,       xmm1
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pmulhw      xmm4,       xmm4
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm2,       xmm1
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklwd   xmm1,       xmm4
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpckhwd   xmm2,       xmm4
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqa      xmm4,       xmm7
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pslld       xmm4,       4
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm4,       xmm7
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm3,       xmm1
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm4,       xmm2
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm3,       flimit4
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psubd       xmm4,       flimit4
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psrad       xmm3,       31
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psrad       xmm4,       31
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan            packssdw    xmm3,       xmm4
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan            packsswb    xmm3,       xmm0
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm1,       QWORD PTR [rsi+rax*8]
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        xmm2,       xmm1
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan            punpcklbw   xmm1,       xmm0
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddw       xmm1,       xmm5
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rcx,        rdx
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan            and         rcx,        127
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan%if ABI_IS_32BIT=1 && CONFIG_PIC=1
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan            push        rax
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rax,        [GLOBAL(sym(vp8_rv))]
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pop         rax
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan%elif ABI_IS_32BIT=0
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan%else
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan%endif
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan            paddw       xmm1,       xmm4
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan            ;paddw     xmm1,       eight8s
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan            psraw       xmm1,       4
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan            packuswb    xmm1,       xmm0
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pand        xmm1,       xmm3
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan            pandn       xmm3,       xmm2
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan            por         xmm1,       xmm3
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan            and         rcx,        15
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan            cmp         edx,        8
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan            jl          .skip_assignment
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan            mov         rcx,        rdx
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan            sub         rcx,        8
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan            and         rcx,        15
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan            movq        [rsi],      mm0
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan.skip_assignment
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rsi,        [rsi+rax]
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan            lea         rdi,        [rdi+rax]
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan            add         rdx,        1
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan            cmp         edx,        dword arg(2) ;rows
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan            jl          .loop_row
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan        add         dword arg(0), 8 ; s += 8
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan        sub         dword arg(3), 8 ; cols -= 8
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan        cmp         dword arg(3), 0
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan        jg          .loop_col
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan    add         rsp, 128+16
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rsp
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ; begin epilog
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rdi
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop rsi
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_GOT
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    RESTORE_XMM
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    UNSHADOW_ARGS
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    pop         rbp
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ret
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan%undef flimit4
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; In-place horizontal smoothing of `rows` rows of `cols` pixels.
;
; For every pixel s[c] a running sum and sum of squares are kept over the
; 15-sample window s[c-7 .. c+7].  Where
;       15*sumsq - sum*sum < flimit
; the pixel is replaced by (s[c] + sum + 8) >> 4, otherwise it is kept.
; Four columns are handled per iteration of .nextcol4.
;
; Memory touched outside [src, src+cols) on each row:
;   * s[-8 .. -1] is filled with s[0] (and later overwritten with zeros by
;     the delayed store while the pipeline fills).
;   * s[cols .. cols+7] is filled with s[cols-1].
;   * the column loop loads 4 bytes from s[rcx+7] for every rcx < cols+8,
;     so the row must be readable a little past the right border.
;
; Register roles inside the row loop:
;   rsi  = s (current row)            rcx = column index (sum during setup)
;   rdx  = cols+8 (sumsq during setup) rdi = window index during setup
;   xmm6 = running sum   (dword lanes) xmm7 = running sumsq (dword lanes)
;   xmm0 = zero                        mm0/mm1 = 2-deep delay line of results
;
; NOTE(review): mm0/mm1 are used and no emms is issued here; presumably the
; caller clears the MMX state - confirm.
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]: flimit replicated into four dwords so it can
    ; be used as a 16-byte aligned psubd memory operand
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]

    ; the row loop below tests its counter at the bottom, so bail out here
    ; when there is nothing to process
    cmp         dword arg(2), 0 ;rows
    jle         .ip_done

    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border
        ; rax (not rdx) holds cols here: rdx is the sumsq accumulator and
        ; must still be zero when .ip_var_loop starts
        movsxd      rax,    dword arg(3)
        movq        mm1,   [rsi + rax + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rax], mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = s[i]*s[i]; upper eax is already 0
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ; seed lane 0 of the running totals; the other lanes become 0
            movd        xmm7,   edx     ; sumsq

            movd        xmm6,   ecx     ; sum

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx     ; column index = 0

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8       ; 8 extra columns flush the delay line
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            ; samples leaving (old) and entering (new) the window
            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; new - old
            paddd       xmm1,   xmm1                    ; 2*old

            paddd       xmm1,   xmm2                    ; new + old
            pmaddwd     xmm1,   xmm2                    ; (new+old)*(new-old) = new^2 - old^2

            ; lane 0 += delta 0, then broadcast lane 0 to all lanes
            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones

            ; remaining deltas: lanes become d1 d2 d3 0
            psrldq      xmm1,       4
            psrldq      xmm2,       4

            ; prefix-sum the deltas: d1 into lanes 1..3
            pshufd      xmm3,   xmm1,   3               ; 0  d1 d1 d1 (squares)
            pshufd      xmm4,   xmm2,   3               ; 0  d1 d1 d1

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            ; d2 into lanes 2..3
            pshufd      xmm3,   xmm1,   01011111b       ; 0  0  d2 d2 (squares)
            pshufd      xmm4,   xmm2,   01011111b       ; 0  0  d2 d2

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            ; d3 into lane 3
            pshufd      xmm3,   xmm1,   10111111b       ; 0  0  0  d3 (squares)
            pshufd      xmm4,   xmm2,   10111111b       ; 0  0  0  d3

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            ; xmm6/xmm7 lane k now hold sum/sumsq for column rcx+k
            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum*sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; 15*sumsq
            psubd       xmm5,   xmm3                    ; 15*sumsq - sum*sum

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all ones where < flimit

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0                    ; 4 byte-wide masks

            movd        xmm1,   DWORD PTR [rsi+rcx]
            movq        xmm2,   xmm1                    ; keep the unfiltered pixels

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]        ; rounding

            psrad       xmm1,   4                       ; (s[c] + sum + 8) >> 4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered where mask set

            pandn       xmm5,   xmm2                    ; original elsewhere
            por         xmm5,   xmm1

            ; results are stored 8 columns late so that the loads at
            ; [rsi+rcx-8] above always see unfiltered pixels
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; carry lane 3 into lane 0

            psrldq      xmm6,   12                      ; carry lane 3 into lane 0
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

.ip_done:
    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; Adds noise to a plane in place, one row at a time.
;
; For each row a start offset (rand() & 0xff) into `noise` is chosen, then
; the row is processed 16 bytes at a time:
;       p  = saturating(p - blackclamp)
;       p  = saturating(p + bothclamp)
;       p  = saturating(p - whiteclamp)
;       p += noise                      (wrapping byte add)
;
; Notes:
;   * whiteclamp and bothclamp are NOT read through their own arguments;
;     they are addressed as blackclamp+16 and blackclamp+32, so the three
;     vectors must be stored contiguously in that order.  They are used as
;     SSE memory operands and therefore must be 16-byte aligned.
;   * The inner loop tests its counter at the bottom and advances by 16, so
;     every processed row has at least 16 bytes modified and the width is
;     effectively rounded up to a multiple of 16.
;   * Nothing is done when Height is 0.
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; the row loop below tests Height at the bottom, so bail out here when
    ; there are no rows to process
    cmp     dword arg(6), 0   ; Height
    je      .addnoise_done

.addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff         ; per-row offset into the noise buffer
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx               ; rdi = noise for this row
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax            ; byte offset within the row

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next 16 bytes

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

.addnoise_done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
719233d2500723e5594f3e7c70896ffeeef32b9c950ywan
720233d2500723e5594f3e7c70896ffeeef32b9c950ywan
SECTION_RODATA
align 16
; Four dwords of 8: the rounding term added before the >>4 in
; vp8_mbpost_proc_across_ip_xmm.  16-byte aligned because it is used
; directly as a paddd memory operand.
four8s:
    times 4 dd 8
725