;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends a 16x16 block of bytes from src into dst, one 16-byte row per
; loop iteration:
;
;     dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
;
; The constants 16 and 8 come from tMFQE and tMFQE_round in the read-only
; data section of this file.
;
; All arithmetic is performed in 16-bit lanes (pmullw keeps only the low
; 16 bits of each product), so the formula is exact only while the weighted
; sum fits in 16 bits.
; NOTE(review): presumably callers pass 0 <= src_weight <= 16 - confirm.
;
; Rows of both src and dst are accessed with movdqa, which faults unless
; the address is 16-byte aligned.
;
; Register roles inside the loop:
;     rax  = current src row          rsi = src_stride
;     rdx  = current dst row          rdi = dst_stride
;     rcx  = rows remaining
;     xmm0 = src_weight in all 8 words
;     xmm1 = dst_weight (tMFQE - src_weight) in all 8 words
;     xmm6 = zero, used to widen bytes to words
;     xmm2-xmm5 = scratch
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6                              ; xmm6 is used below; presumably callee-saved on some ABIs
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero for byte -> word unpacking

.combine:
    movdqa      xmm2, [rax]                 ; 16 src bytes
    movdqa      xmm4, [rdx]                 ; 16 dst bytes
    add         rax, rsi                    ; advance src to the next row

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6                  ; low 8 bytes  -> words
    punpckhbw   xmm3, xmm6                  ; high 8 bytes -> words
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3                  ; words -> 16 bytes (unsigned saturation)
    movdqa      [rdx], xmm2
    add         rdx, rdi                    ; advance dst to the next row

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Blends an 8x8 block of bytes from src into dst, one 8-byte row per loop
; iteration:
;
;     dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
;
; The constants 16 and 8 come from tMFQE and tMFQE_round below.
;
; Arithmetic is performed in 16-bit lanes (pmullw keeps only the low 16 bits
; of each product), so the formula is exact only while the weighted sum fits
; in 16 bits.
; NOTE(review): presumably callers pass 0 <= src_weight <= 16 - confirm.
;
; Register roles inside the loop:
;     rax  = current src row          rsi = src_stride
;     rdx  = current dst row          rdi = dst_stride
;     rcx  = rows remaining
;     xmm0 = src_weight in all 8 words
;     xmm1 = dst_weight (tMFQE - src_weight) in all 8 words
;     xmm4 = zero, used to widen bytes to words and as the pack filler
;     xmm2, xmm3 = scratch
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero for byte -> word unpacking

.combine:
    movq        xmm2, [rax]                 ; 8 src bytes
    movq        xmm3, [rdx]                 ; 8 dst bytes
    add         rax, rsi                    ; advance src to the next row

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4                  ; result in the low 8 bytes, upper 8 are zero
    movq        [rdx], xmm2
    add         rdx, rdi                    ; advance dst to the next row

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
;
; Walks 16 rows of 16 bytes from src1 and src2 and stores two 32-bit results:
;
;     *sad      = (SUM(|src1 - src2|) + 128) >> 8
;     *variance = (SUM(src2^2) - (SUM(src2)^2 >> 8) + 128) >> 8
;
; Register roles inside the loop:
;     rax  = current src1 row         rcx = stride1
;     rdx  = current src2 row         rdi = stride2
;     rsi  = rows remaining
;     xmm3 = SAD accumulator (one partial sum per 64-bit half)
;     xmm4 = SUM(src2) accumulator (one partial sum per 64-bit half)
;     xmm5 = SUM(src2^2) accumulator (four dword partial sums)
;     xmm0-xmm2 = scratch
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so the rows are
    ; fetched with unaligned loads (movdqa would fault on an address
    ; that is not 16-byte aligned).
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0                  ; squares, pairwise summed into dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8                     ; bring the upper-half partial sum down
    paddusw     xmm0, xmm3                  ; low word = total SAD
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8                     ; (sad + 128) >> 8

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value
    pmuludq     xmm0, xmm0                  ; SUM(src2)^2 in the low qword
    psrld       xmm0, 8                     ; low dword = SUM(src2)^2 >> 8

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; low dword = SUM(src2^2)

    psubd       xmm1, xmm0                  ; SUM(src2^2) - (SUM(src2)^2 >> 8)

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:                                       ; rounding term for the ">> 8" results above
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq 0, 128
%else
    dq 128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
