;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
11ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
12ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%include "vpx_ports/x86_abi_support.asm"
13ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
;void vp9_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; Thresholded 5-tap smoothing filter, run as two passes per row, 8 pixels
; per iteration:
;   1. "down":   vertical taps r-2, r-1, r0, r+1, r+2 read from src_ptr,
;                result written to dst_ptr.
;   2. "across": horizontal taps p-2, p-1, p0, p+1, p+2 read from dst_ptr,
;                result written back to dst_ptr (in place).
; In both passes the candidate value is
;       (4*centre + sum of the four neighbours + RD42) >> 3
; and it replaces the centre pixel only when none of the four
; |centre - neighbour| differences is greater than flimit; otherwise the
; centre pixel is kept unchanged.
;
; Register roles in the loops:
;   rsi   = current source row        rdi   = current destination row
;   rax   = src_pixels_per_line (negated temporarily inside .nextcol)
;   rcx   = rows left                 rdx   = column offset within the row
;   xmm0  = all zero (byte -> word unpacking, packing)
;   xmm2  = low 16 bits of flimit replicated into all 8 words
;   mm0   = across pass: the previous 8 result bytes awaiting their store
;
; Memory touched outside [0, cols): the down pass reads rows -2..+2 around
; the current source row, the across pass reads dst bytes from -2 up to
; (last column group)+9 and reads/rewrites the 8 bytes at dst-8.
; Both column loops and the row loop test their condition at the bottom,
; so each body runs at least once.
;
; NOTE(review): mm0 is written in the across pass and no emms is executed
; in this function; presumably callers reset the MMX/x87 state - confirm.
global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
sym(vp9_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0, [GLOBAL(rd42)]
    sub         rsp, 16
    movdqa      [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif


        ; xmm2 = low word of flimit broadcast to all 8 word lanes
        movd        xmm2,       dword ptr arg(6) ;flimit
        punpcklwd   xmm2,       xmm2
        punpckldq   xmm2,       xmm2
        punpcklqdq  xmm2,       xmm2

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx,        DWORD PTR arg(4) ;rows
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        pxor        xmm0,       xmm0              ; xmm0 = 0, stays zero for the whole function

.nextrow:

        xor         rdx,        rdx       ; clear out rdx for use as loop counter
.nextcol:
        ; ---- down pass: 8 pixels of the current row --------------------
        movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7, kept as the centre
        psllw       xmm3,       2                       ; xmm3 = 4 * r0

        movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
        paddusw     xmm3,       xmm5                    ; xmm3 += r1

        ; thresholding
        movdqa      xmm7,       xmm1                    ; xmm7 = r0
        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
        pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask(abs > flimit)

        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
        paddusw     xmm3,       xmm5                    ; xmm3 += r2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        neg         rax                                 ; rax = -pitch for the rows above
        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
        paddusw     xmm3,       xmm5                    ; xmm3 += r-2

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds

        movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
        paddusw     xmm3,       xmm4                    ; xmm3 += r-1

        ; thresholding
        movdqa      xmm6,       xmm1                    ; xmm6 = r0
        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        paddusw     xmm3,       RD42                    ; xmm3 += round value
        psraw       xmm3,       3                       ; xmm3 /= 8

        pand        xmm1,       xmm7                    ; keep source where any diff > flimit
        pandn       xmm7,       xmm3                    ; take blurred result everywhere else
        paddusw     xmm1,       xmm7                    ; combination

        packuswb    xmm1,       xmm0                    ; pack to bytes
        movq        QWORD PTR [rdi], xmm1               ; store 8 filtered pixels

        neg         rax                   ; pitch is positive
        add         rsi,        8
        add         rdi,        8

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols

        jl          .nextcol

        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx       ; rewind both pointers to the row start
        sub         rdi,        rdx

        xor         rdx,        rdx
        ; Prime the delayed store: the first iteration writes these same
        ; 8 bytes straight back to [rdi-8].
        movq        mm0,        QWORD PTR [rdi-8];

.acrossnextcol:
        ; ---- across pass: 8 pixels of the destination row ---------------
        ; Build xmm4 = dst bytes p-2..p9 (12 bytes). These loads happen
        ; before the previous group's result is stored, so p-2 and p-1 are
        ; still the unfiltered (down-pass) values.
        movq        xmm7,       QWORD PTR [rdi +rdx -2]
        movd        xmm4,       DWORD PTR [rdi +rdx +6]

        pslldq      xmm4,       8
        por         xmm4,       xmm7

        movdqa      xmm3,       xmm4
        psrldq      xmm3,       2
        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, kept as the centre
        psllw       xmm3,       2                 ; xmm3 = 4 * centre


        movdqa      xmm5,       xmm4
        psrldq      xmm5,       3
        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
        paddusw     xmm3,       xmm5              ; xmm3 += p1..p8

        ; thresholding
        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
        pcmpgtw     xmm7,       xmm2              ; xmm7 = mask(abs > flimit)

        movdqa      xmm5,       xmm4
        psrldq      xmm5,       4
        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
        paddusw     xmm3,       xmm5              ; xmm3 += p2..p9

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds


        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9 (bytes)
        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5 (words)
        paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        psrldq      xmm4,       1                 ; xmm4 = p-1..p9 (bytes)
        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6 (words)
        paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6

        ; thresholding
        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        paddusw     xmm3,       RD42              ; xmm3 += round value
        psraw       xmm3,       3                 ; xmm3 /= 8

        pand        xmm1,       xmm7              ; keep source where any diff > flimit
        pandn       xmm7,       xmm3              ; take blurred result everywhere else
        paddusw     xmm1,       xmm7              ; combination

        packuswb    xmm1,       xmm0              ; pack to bytes
        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous group's eight bytes
        movdq2q     mm0,        xmm1              ; hold this group's result until the next load is done

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          .acrossnextcol;

        ; last 8 pixels
        movq        QWORD PTR [rdi+rdx-8],  mm0

        ; done with this row
        ; Reload the pitches with movsxd (as in the initial load above) so a
        ; negative pitch is sign-extended on 64-bit targets rather than being
        ; zero-extended by a 32-bit mov.
        add         rsi,        rax               ; next source line (rax = src pitch)
        movsxd      rax,        DWORD PTR arg(3) ;dst_pixels_per_line
        add         rdi,        rax               ; next destination line
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line, restored for .nextcol

        dec         rcx                   ; decrement count
        jnz         .nextrow              ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; drop the stack copy of rd42, then undo ALIGN_STACK
    add rsp,16
    pop rsp
%endif
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42
249ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
250ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
251ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
252ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;                            int pitch, int rows, int cols,int flimit)
253ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangextern sym(vp9_rv)
254ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_mbpost_proc_down_xmm) PRIVATE
255ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_mbpost_proc_down_xmm):
256ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rbp
257ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         rbp, rsp
258ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SHADOW_ARGS_TO_STACK 5
259ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SAVE_XMM 7
260ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    GET_GOT     rbx
261ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rsi
262ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rdi
263ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; end prolog
264ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
265ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ALIGN_STACK 16, rax
266ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    sub         rsp, 128+16
267ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
268ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; unsigned char d[16][8] at [rsp]
269ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; create flimit2 at [rsp+128]
270ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         eax, dword ptr arg(4) ;flimit
271ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+128], eax
272ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+128+4], eax
273ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+128+8], eax
274ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+128+12], eax
275ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%define flimit4 [rsp+128]
276ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
277ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if ABI_IS_32BIT=0
278ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    lea         r8,       [GLOBAL(sym(vp9_rv))]
279ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif
280ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
281ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ;rows +=8;
282ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    add         dword arg(2), 8
283ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
284ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ;for(c=0; c<cols; c+=8)
285ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.loop_col:
286ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov         rsi,        arg(0) ; s
287ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        xmm0,       xmm0        ;
288ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
289ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movsxd      rax,        dword ptr arg(1) ;pitch       ;
290ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            neg         rax                                     ; rax = -pitch
291ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
292ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
293ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            neg         rax
294ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
295ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
296ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        xmm5,       xmm5
297ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        xmm6,       xmm6        ;
298ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
299ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        xmm7,       xmm7        ;
300ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov         rdi,        rsi
301ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
302ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov         rcx,        15          ;
303ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
304ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.loop_initvar:
305ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        xmm1,       QWORD PTR [rdi];
306ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm1,       xmm0        ;
307ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
308ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddw       xmm5,       xmm1        ;
309ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmullw      xmm1,       xmm1        ;
310ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
311ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm2,       xmm1        ;
312ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm1,       xmm0        ;
313ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
314ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpckhwd   xmm2,       xmm0        ;
315ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm6,       xmm1        ;
316ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
317ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm7,       xmm2        ;
318ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            lea         rdi,        [rdi+rax]   ;
319ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
320ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            dec         rcx
321ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            jne         .loop_initvar
322ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            ;save the var and sum
323ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            xor         rdx,        rdx
324ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.loop_row:
325ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
326ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
327ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
328ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm1,       xmm0
329ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm2,       xmm0
330ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
331ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddw       xmm5,       xmm2
332ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubw       xmm5,       xmm1
333ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
334ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmullw      xmm2,       xmm2
335ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm4,       xmm2
336ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
337ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm2,       xmm0
338ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpckhwd   xmm4,       xmm0
339ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
340ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm6,       xmm2
341ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm7,       xmm4
342ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
343ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmullw      xmm1,       xmm1
344ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm2,       xmm1
345ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
346ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm1,       xmm0
347ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm6,       xmm1
348ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
349ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpckhwd   xmm2,       xmm0
350ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm7,       xmm2
351ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
352ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
353ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm3,       xmm6
354ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pslld       xmm3,       4
355ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
356ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm3,       xmm6
357ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm1,       xmm5
358ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
359ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm4,       xmm5
360ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmullw      xmm1,       xmm1
361ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
362ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmulhw      xmm4,       xmm4
363ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm2,       xmm1
364ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
365ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm1,       xmm4
366ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpckhwd   xmm2,       xmm4
367ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
368ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm4,       xmm7
369ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pslld       xmm4,       4
370ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
371ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm4,       xmm7
372ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
373ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm3,       xmm1
374ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm4,       xmm2
375ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
376ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm3,       flimit4
377ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm4,       flimit4
378ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
379ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrad       xmm3,       31
380ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrad       xmm4,       31
381ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
382ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packssdw    xmm3,       xmm4
383ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packsswb    xmm3,       xmm0
384ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
385ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        xmm1,       QWORD PTR [rsi+rax*8]
386ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
387ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        xmm2,       xmm1
388ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm1,       xmm0
389ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
390ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddw       xmm1,       xmm5
391ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov         rcx,        rdx
392ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
393ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            and         rcx,        127
394ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if ABI_IS_32BIT=1 && CONFIG_PIC=1
395ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            push        rax
396ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            lea         rax,        [GLOBAL(sym(vp9_rv))]
397ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
398ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pop         rax
399ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%elif ABI_IS_32BIT=0
400ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%else
402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif
404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddw       xmm1,       xmm4
406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            ;paddw     xmm1,       eight8s
407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psraw       xmm1,       4
408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packuswb    xmm1,       xmm0
410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pand        xmm1,       xmm3
411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pandn       xmm3,       xmm2
413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            por         xmm1,       xmm3
414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            and         rcx,        15
416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov         rcx,        rdx
419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            sub         rcx,        8
420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            and         rcx,        15
422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        [rsi],      mm0
425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            lea         rsi,        [rsi+rax]
426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            lea         rdi,        [rdi+rax]
428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            add         rdx,        1
429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            cmp         edx,        dword arg(2) ;rows
431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            jl          .loop_row
432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         dword arg(0), 8 ; s += 8
434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        sub         dword arg(3), 8 ; cols -= 8
435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        cmp         dword arg(3), 0
436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        jg          .loop_col
437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    add         rsp, 128+16
439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rsp
440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; begin epilog
442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rdi
443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rsi
444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RESTORE_GOT
445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RESTORE_XMM
446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    UNSHADOW_ARGS
447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rbp
448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ret
449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%undef flimit4
450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;                                int pitch, int rows, int cols,int flimit)
;
; In-place horizontal filter, processed one row at a time and four pixels
; per inner iteration.  For every column c the code maintains a running
; sum and a running sum of squares, each updated with s[c+7] entering and
; s[c-8] leaving.  The pixel s[c] is replaced by (s[c] + sum + 8) >> 4
; when 15*sumsq - sum*sum < flimit, and is kept unchanged otherwise.
;
; Arguments (accessed through arg(n) after SHADOW_ARGS_TO_STACK):
;   arg(0) src    - first pixel of the current row; the shadow copy is
;                   advanced by pitch after every row
;   arg(1) pitch  - byte distance between rows (sign-extended from 32 bits)
;   arg(2) rows   - row counter, decremented in place; it is tested after
;                   the row body, so the body runs at least once
;   arg(3) cols   - the column loop runs for rcx = 0, 4, 8, ... while
;                   rcx < cols + 8
;   arg(4) flimit - threshold, replicated into four dwords at [rsp]
;
; Memory access, as visible in the loops below:
;   * the set-up loop reads s[-8 .. 6]; the column loop reads
;     s[rcx-8 .. rcx-5], s[rcx .. rcx+3] and s[rcx+7 .. rcx+10], so the
;     caller has to provide readable pixels on both sides of the row
;   * each result dword is stored two iterations late at s[rcx-8]; the
;     first two stores of every row therefore write the zeroed mm0/mm1
;     to s[-8 .. -1]
;
; Register roles inside the column loop:
;   rsi = row base, rcx = column offset, rdx = cols + 8, xmm0 = zero,
;   xmm6 = running sum, xmm7 = running sum of squares,
;   mm0/mm1 = two-stage delay line for finished output dwords.
;
; NOTE(review): mm0/mm1 are written and no emms appears in this routine;
;   presumably the caller clears the MMX state afterwards - confirm.
454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_mbpost_proc_across_ip_xmm):
456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rbp
457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         rbp, rsp
458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SHADOW_ARGS_TO_STACK 5
459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SAVE_XMM 7
460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    GET_GOT     rbx
461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rsi
462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rdi
463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; end prolog
464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
    ; NOTE(review): ALIGN_STACK comes from x86_abi_support.asm (not shown
    ; here); presumably it aligns rsp to 16 bytes and pushes the old rsp,
    ; which the `pop rsp` before the epilogue restores - confirm.
465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ALIGN_STACK 16, rax
466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    sub         rsp, 16                 ; 16 bytes of scratch for flimit4
467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; create flimit4 at [rsp]: four dword copies of flimit, later used as
    ; a 128-bit memory operand of psubd
469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         eax, dword ptr arg(4) ;flimit
470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp], eax
471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+4], eax
472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+8], eax
473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         [rsp+12], eax
474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%define flimit4 [rsp]
475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ;do { ... } while (--rows > 0);  one pass per row, tested at the bottom
478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.ip_row_loop:
479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        xor         rdx,    rdx ;sumsq=0;
481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        xor         rcx,    rcx ;sum=0;
482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rsi,    arg(0); s
483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mov         rdi,    -8  ;i=-8
484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.ip_var_loop:
485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;for(i=-8;i<=6;i++)
486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;{
487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;    sumsq += s[i]*s[i];
488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;    sum   += s[i];
489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;}
490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movzx       eax, byte [rsi+rdi] ; eax = s[i], upper bits zero
491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         ecx, eax            ; sum += s[i]
492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        mul         al                  ; ax = al*al; 8-bit mul leaves edx alone
493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         edx, eax            ; sumsq += s[i]*s[i]
494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add         rdi, 1
495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        cmp         rdi, 6
496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        jle         .ip_var_loop
497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            ;mov         rax,    sumsq
500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            ;movd        xmm7,   rax
501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movd        xmm7,   edx     ; xmm7 = sumsq in lane 0, other lanes 0
502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            ;mov         rax,    sum
504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            ;movd        xmm6,   rax
505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movd        xmm6,   ecx     ; xmm6 = sum in lane 0, other lanes 0
506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov         rsi,    arg(0) ;s
508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            xor         rcx,    rcx     ; rcx = column offset c = 0
509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movsxd      rdx,    dword arg(3) ;cols
511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            add         rdx,    8       ; loop bound cols+8: 8 extra columns flush the delayed stores
512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        mm0,    mm0     ; clear both delay-line stages for this row
513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        mm1,    mm1
514ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
515ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pxor        xmm0,   xmm0    ; xmm0 = 0, used for unpacking and packing
516ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.nextcol4:
517ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
            ; On entry lane 0 of xmm6/xmm7 holds the running sum / sumsq
            ; carried over from the previous group; lanes 1..3 are zero.
518ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (pixels leaving the window)
519ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (pixels entering the window)
520ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
521ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm1,   xmm0                    ; expanding
522ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm2,   xmm0                    ; expanding
523ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
524ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
525ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
526ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
527ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm2,   xmm1                    ; x = 7--8   8--7   9--6 10--5 (entering - leaving)
528ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
529ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
530ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm1,   xmm2                    ; y = 7+-8   8+-7   9+-6 10+-5 (entering + leaving)
531ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmaddwd     xmm1,   xmm2                    ; x*y per lane = entering^2 - leaving^2 (high word of y is 0)
532ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
533ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm6,   xmm2                    ; lane 0 = sum after column rcx
534ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm7,   xmm1                    ; lane 0 = sumsq after column rcx
535ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
536ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
537ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
538ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
            ; Build per-lane prefix sums: after the three shuffle/add
            ; steps below, lane k of xmm6/xmm7 holds the running value
            ; after column rcx+k.
539ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000 (squared deltas)
540ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
541ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
542ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
543ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7
544ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
545ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm6,   xmm4
546ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm7,   xmm3
547ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
548ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
549ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6
550ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
551ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm7,   xmm3
552ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm6,   xmm4
553ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
554ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   0000  10--5 squared
555ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   0000  10--5
556ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
557ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm7,   xmm3
558ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm6,   xmm4
559ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
            ; pmaddwd of the sum with itself gives low_word^2 + high_word^2
            ; per lane, i.e. sum*sum as long as the sum fits in the low word.
560ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm3,   xmm6
561ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pmaddwd     xmm3,   xmm3
562ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
563ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqa      xmm5,   xmm7
564ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pslld       xmm5,   4       ; 16*sumsq
565ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
566ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm5,   xmm7    ; 15*sumsq
567ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm5,   xmm3    ; 15*sumsq - sum*sum
568ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
569ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubd       xmm5,   flimit4
570ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrad       xmm5,   31      ; lane = all ones where 15*sumsq - sum*sum < flimit, else 0
571ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
572ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packssdw    xmm5,   xmm0    ; narrow the mask dwords -> words
573ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packsswb    xmm5,   xmm0    ; words -> bytes: low 4 bytes are the per-pixel mask
574ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
575ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movd        xmm1,   DWORD PTR [rsi+rcx]     ; the four pixels being filtered
576ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        xmm2,   xmm1                    ; keep an unmodified copy
577ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
578ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklbw   xmm1,   xmm0
579ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            punpcklwd   xmm1,   xmm0    ; pixels widened to dwords
580ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
581ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm1,   xmm6    ; s[c] + sum
582ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 (rounding)
583ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
584ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrad       xmm1,   4       ; (s[c] + sum + 8) >> 4
585ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packssdw    xmm1,   xmm0
586ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
587ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            packuswb    xmm1,   xmm0    ; filtered pixels back to bytes
588ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pand        xmm1,   xmm5    ; filtered value where the mask is set
589ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
590ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            pandn       xmm5,   xmm2    ; original value where the mask is clear
591ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            por         xmm5,   xmm1    ; xmm5 = merged output for columns rcx..rcx+3
592ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
            ; Outputs are written back 8 columns behind the read position,
            ; after the load from [rsi+rcx-8] at the top of this iteration.
593ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movd        [rsi+rcx-8],  mm0       ; store the result computed two iterations ago
594ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movq        mm0,    mm1             ; advance the delay line
595ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
596ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdq2q     mm1,    xmm5            ; newest result enters the delay line
597ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrldq      xmm7,   12              ; carry lane 3 (latest sumsq) into lane 0, zero the rest
598ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
599ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psrldq      xmm6,   12              ; carry lane 3 (latest sum) into lane 0, zero the rest
600ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            add         rcx,    4
601ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
602ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            cmp         rcx,    rdx
603ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            jl          .nextcol4               ; while (c < cols + 8)
604ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
605ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        ;s+=pitch;
606ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        movsxd rax, dword arg(1)
607ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        add    arg(0), rax
608ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
609ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        sub dword arg(2), 1 ;rows-=1
610ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        cmp dword arg(2), 0
611ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang        jg .ip_row_loop
612ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
613ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    add         rsp, 16                 ; release the flimit4 scratch
614ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rsp                     ; presumably the rsp saved by ALIGN_STACK - see note above
615ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
616ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; begin epilog
617ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rdi
618ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rsi
619ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RESTORE_GOT
620ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RESTORE_XMM
621ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    UNSHADOW_ARGS
622ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rbp
623ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ret
624ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%undef flimit4
625ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
626ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
627ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
628ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;                            unsigned char blackclamp[16],
629ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;                            unsigned char whiteclamp[16],
630ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;                            unsigned char bothclamp[16],
631ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;                            unsigned int width, unsigned int height, int pitch)
;
; Adds noise to a plane in place, 16 bytes per step.  For every row:
;   1. call LIBVPX_RAND and use the low 8 bits of its result as a byte
;      offset (0..255) into `noise`
;   2. for each 16-byte group of the row: clamp the source with
;      saturating subtract blackclamp, saturating add bothclamp,
;      saturating subtract whiteclamp, then add the noise bytes with a
;      wrapping byte add and store the result back
;
; Things a caller has to know, all visible in the code below:
;   * only arg(2) (blackclamp) is loaded; whiteclamp and bothclamp are
;     addressed as [rdx+16] and [rdx+32], so the three vectors must be
;     stored contiguously in that order - arg(3)/arg(4) are never read
;   * the clamp vectors are direct memory operands of psubusb/paddusb and
;     therefore must be 16-byte aligned; source and noise are accessed
;     with movdqu and may be unaligned
;   * the column loop steps by 16 and tests after the body, so at least
;     one 16-byte group is processed per row and a width that is not a
;     multiple of 16 makes the last group extend past `width`
;   * the row loop also tests after the body (runs at least once)
;   * width and pitch are sign-extended from 32 bits and the loop
;     conditions use signed compares
;
; NOTE(review): LIBVPX_RAND is defined outside this chunk; presumably it
;   names the C library rand() (the comment below refers to rand()) -
;   confirm.
632ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_plane_add_noise_wmt) PRIVATE
633ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_plane_add_noise_wmt):
634ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rbp
635ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov         rbp, rsp
636ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    SHADOW_ARGS_TO_STACK 8
637ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    GET_GOT     rbx
638ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rsi
639ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    push        rdi
640ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; end prolog
641ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
    ; One pass per row.  Every register used below is (re)loaded after
    ; the call, so nothing is expected to survive it.
642ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.addnoise_loop:
643614a6a21483b59b4ab557785c160c8ca4722b062Johann    call sym(LIBVPX_RAND) WRT_PLT
644ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov     rcx, arg(1) ;noise
645ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    and     rax, 0xff   ; keep the low byte of the random value: offset 0..255
646ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    add     rcx, rax    ; rcx = noise + offset
647ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
648ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; we rely on the fact that the clamping vectors are stored contiguously
649ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; in black/white/both order. Note that we have to reload this here because
650ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; rdx could be trashed by rand()
651ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    mov     rdx, arg(2) ; blackclamp
652ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
653ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
654ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov     rdi, rcx            ; rdi = noise source for this row
655ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movsxd  rcx, dword arg(5) ;[Width]
656ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            mov     rsi, arg(0) ;Pos
657ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            xor         rax,rax         ; rax = byte offset within the row
658ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
659ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.addnoise_nextset:
660ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqu      xmm1,[rsi+rax]         ; get the source
661ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
662ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
663ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddusb     xmm1, [rdx+32] ;bothclamp
664ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            psubusb     xmm1, [rdx+16] ;whiteclamp
665ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
666ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
667ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            paddb       xmm1,xmm2              ; add it in (wrapping byte add)
668ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            movdqu      [rsi+rax],xmm1         ; store the result
669ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
670ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            add         rax,16                 ; advance to the next 16 bytes of this row
671ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
672ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            cmp         rax, rcx
673ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang            jl          .addnoise_nextset      ; while (offset < width), signed compare
674ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
675ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    movsxd  rax, dword arg(7) ; Pitch
676ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    add     arg(0), rax ; Start += Pitch
677ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    sub     dword arg(6), 1   ; Height -= 1
678ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    jg      .addnoise_loop    ; flags come straight from the sub above
679ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
680ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ; begin epilog
681ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rdi
682ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop rsi
683ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    RESTORE_GOT
684ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    UNSHADOW_ARGS
685ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    pop         rbp
686ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    ret
687ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
688ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang
689ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangSECTION_RODATA
690ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangalign 16
; rd42: eight 16-bit words, each holding 4 (16 bytes in total).
; NOTE(review): presumably a rounding constant for the down/across filter
;   defined earlier in this file - confirm against its users.
691ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangrd42:
692ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    times 8 dw 0x04
; four8s: four 32-bit dwords, each holding 8.  Added as the rounding term
; before the arithmetic shift right by 4 in vp9_mbpost_proc_across_ip_xmm.
; It is used as a 128-bit paddd memory operand; it stays 16-byte aligned
; because it directly follows the 16-byte rd42 after `align 16`.
693ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangfour8s:
694ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang    times 4 dd 8
695