;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends a 16x16 block of src into dst, in place, one 16-byte row per
; iteration:
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
; The arithmetic is done on 16-bit lanes; only the low 16 bits of src_weight
; are used. NOTE(review): presumably 0 <= src_weight <= 16 so the weighted
; sum fits in a word - confirm against callers.
;
; Alignment: every src and dst row is accessed with movdqa and therefore
; must be 16-byte aligned.
;
; Register roles:
;   rax = src row pointer        rsi = src_stride
;   rdx = dst row pointer        rdi = dst_stride
;   rcx = rows remaining
;   xmm0 = src_weight in all 8 words
;   xmm1 = dst_weight (tMFQE - src_weight) in all 8 words
;   xmm6 = zero, used to widen bytes to words
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    ; The strides are C ints: sign-extend them to pointer width before they
    ; are added to pointers, since the upper half of the argument slot is not
    ; guaranteed to be meaningful on 64-bit targets.
    ; NOTE(review): relies on the ABI include aliasing movsxd to mov for
    ; 32-bit builds - confirm.
    mov         rax, arg(0)                 ; src
    movsxd      rsi, dword arg(1)           ; src_stride
    mov         rdx, arg(2)                 ; dst
    movsxd      rdi, dword arg(3)           ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6                  ; low 8 pixels -> words
    punpckhbw   xmm3, xmm6                  ; high 8 pixels -> words
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3                  ; back to 16 bytes
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends an 8x8 block of src into dst, in place, one 8-byte row per
; iteration:
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
; The arithmetic is done on 16-bit lanes; only the low 16 bits of src_weight
; are used. NOTE(review): presumably 0 <= src_weight <= 16 so the weighted
; sum fits in a word - confirm against callers.
;
; Rows are accessed with movq, so no particular alignment of src or dst is
; required.
;
; Register roles:
;   rax = src row pointer        rsi = src_stride
;   rdx = dst row pointer        rdi = dst_stride
;   rcx = rows remaining
;   xmm0 = src_weight in all 8 words
;   xmm1 = dst_weight (tMFQE - src_weight) in all 8 words
;   xmm4 = zero, used to widen bytes to words and as the pack filler
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    ; The strides are C ints: sign-extend them to pointer width before they
    ; are added to pointers, since the upper half of the argument slot is not
    ; guaranteed to be meaningful on 64-bit targets.
    ; NOTE(review): relies on the ABI include aliasing movsxd to mov for
    ; 32-bit builds - confirm.
    mov         rax, arg(0)                 ; src
    movsxd      rsi, dword arg(1)           ; src_stride
    mov         rdx, arg(2)                 ; dst
    movsxd      rdi, dword arg(3)           ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4                  ; 8 result bytes in the low qword
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
;
; Walks 16 rows of 16 pixels from src1 and src2 and stores two scaled
; statistics:
;   *sad      = (SAD(src1, src2) + 128) >> 8
;   *variance = (SUM(src2^2) - (SUM(src2)^2 >> 8) + 128) >> 8
; Only src2 contributes to the variance term; src1 is used for the SAD only.
;
; Register roles:
;   rax = src1 row pointer       rcx = stride1
;   rdx = src2 row pointer       rdi = stride2
;   rsi = rows remaining
;   xmm3 = SAD accumulator (one partial sum per qword)
;   xmm4 = sum of src2 accumulator (one partial sum per qword)
;   xmm5 = sum of src2^2 accumulator (four dword partial sums)
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The strides are C ints: sign-extend them to pointer width before they
    ; are added to pointers, since the upper half of the argument slot is not
    ; guaranteed to be meaningful on 64-bit targets.
    ; NOTE(review): relies on the ABI include aliasing movsxd to mov for
    ; 32-bit builds - confirm.
    mov         rax,        arg(0)          ; src1
    movsxd      rcx,        dword arg(1)    ; stride1
    mov         rdx,        arg(2)          ; src2
    movsxd      rdi,        dword arg(3)    ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so the rows are
    ; fetched with unaligned loads.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0                  ; squares, pairwise summed to dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words, so the two halves of
    ; each accumulator are folded together by hand.
    ; Finalize SAD and store: (SAD + 128) >> 8
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax,  arg(5)
    movd        [rax], xmm0

    ; Fold the two partial sums of src2 into the low word
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum of src2; only the low dword of the product is used
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8                     ; SUM(src2)^2 >> 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. Interleave the four dword partial sums of
    ; src2^2 with zero, add the two halves, then shift and add once more
    ; so that the low dword holds the total.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0                  ; SUM(src2^2) - (SUM(src2)^2 >> 8)

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA

; 128-bit constant whose value is 128: the rounding term added with paddd
; before the ">> 8" steps in vp8_variance_and_sad_16x16_sse2. The single
; ddq form is used when the assembler is not NASM; under NASM the value is
; spelled as two qwords, ordered according to CONFIG_BIG_ENDIAN.
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif

; Total blend weight for the filter_by_weight routines, one copy per word.
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10

; Rounding term added before the ">> 4" in the filter_by_weight routines,
; one copy per word.
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08

288