;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


;-----------------------------------------------------------------------------
; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;
; Quantizes the 16 coefficients of one block.  For each coefficient, visited
; in zig-zag order, the value is kept only if its magnitude clears the
; zero-bin threshold (zbin + zbin_extra + zero-run boost) and the quantized
; magnitude is non-zero.  Results are written through the pointers stored in
; *d: qcoeff (signed quantized values), dqcoeff (qcoeff * dequant) and eob
; (one byte: index of the last non-zero coefficient in zig-zag order, plus 1,
; or 0 if all are zero).
;
; Syntax: NASM/yasm (Intel operand order).
; ABI:    selected at assembly time through ABI_IS_32BIT / LIBVPX_YASM_WIN64.
;           32-bit : arguments read with arg(n); rbp, rbx (GOT), rdi and rsi
;                    are saved; 32 bytes of aligned stack hold qcoeff.
;           Win64  : b = rcx, d = rdx; rdi, rsi and xmm6-xmm8 (SAVE_XMM 8)
;                    are saved.
;           other  : b = rdi, d = rsi; nothing is saved.
; Memory: all 16-byte vector accesses use movdqa, so the buffers reached
;         through b->coeff, zbin, round, quant, quant_shift and d->dequant,
;         qcoeff, dqcoeff must be 16-byte aligned.
;
; Register roles inside the body:
;   rdi        b (dead after the last b-> load; reused as scratch "y")
;   rsi        d
;   rax        coeff ptr -> quant_shift ptr -> zrun_zbin_boost base
;   rcx        zbin ptr  -> quant ptr       -> scratch / d-> pointers
;   rdx        round ptr -> running zrun_zbin_boost pointer -> EOB mask
;   xmm0,xmm1  sz   : per-word sign mask of z (0 or -1)
;   xmm2,xmm3  x    : abs(z), later y = (x + round) * quant >> 16 + (x + round)
;   xmm6,xmm7  x - (zbin + zbin_extra)
;   xmm5       quant_shift[0..15], one byte per coefficient
;   xmm4,xmm8  qcoeff accumulators (64-bit builds; 32-bit builds use the stack)
;-----------------------------------------------------------------------------

global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax
    %define qcoeff      0 ; 32 bytes: 16 words of quantized output
    %define stack_size 32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

    ; Bring both arguments into rdi (b) and rsi (d) regardless of ABI.
%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value into all eight words of xmm7
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz = z >> 15 (0 for non-negative words, -1 for negative words)
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z) = (z ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    ; xmm7 (zbin_oq_value) is dead from here on and is reused below
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    ; last loads through b: rdi is free to be clobbered afterwards
    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = x * quant_ptr >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; Clear the qcoeff accumulators; the zig-zag pass only writes the
    ; coefficients that survive, every other entry must read back as 0.
    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; quant_shift (16 bytes, indexed by rc with pextrb below)
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost: rax keeps the base, rdx walks forward
    mov         rax, rdx

;-----------------------------------------------------------------------------
; ZIGZAG_LOOP rc, lane, y_reg, xz_reg, q_reg
;   %1  rc     : coefficient index 0..15 (byte lane in xmm5, word slot in
;                qcoeff)
;   %2  lane   : word lane of rc inside its 8-word register (rc & 7)
;   %3  y_reg  : register holding the unshifted y for this half
;   %4  xz_reg : register holding x - (zbin + zbin_extra) for this half
;   %5  q_reg  : qcoeff accumulator for this half (64-bit builds only)
;
; Processes one coefficient.  rdx advances by one boost entry on every
; invocation and is reset to the base (rax) whenever a non-zero value is
; emitted.
; Clobbers: ecx, edi, rdx, flags.
;-----------------------------------------------------------------------------
%macro ZIGZAG_LOOP 5
    ; x
    pextrw      ecx, %4, %2

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ (lea keeps flags)
    jl          .rq_zigzag_loop_%1          ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    sar         edi, cl
    ; A shift whose count is 0 leaves the flags untouched, so ZF could still
    ; reflect the "sub cx" above.  Test y explicitly instead of relying on
    ; the flags produced by sar.
    test        edi, edi
    je          .rq_zigzag_loop_%1          ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

    ; Get the unsigned qcoeff halves into xmm4 / xmm5 for the shared tail.
%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    ; The upper half already lives in xmm8; alias it so the code below can
    ; keep saying xmm5 (quant_shift in the real xmm5 is no longer needed).
    %define     xmm5 xmm8
%endif

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz  (re-apply the sign of the input coefficient)
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    ; dqcoeff = qcoeff * dequant (low 16 bits of each product)
    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6                  ; 0xFFFF where qcoeff == 0
    pcmpeqw     xmm5, xmm6

    packsswb    xmm4, xmm5                  ; one 0xFF/0x00 byte per coefficient
    pshufb      xmm4, [GLOBAL(zig_zag1d)]   ; reorder bytes into zig-zag order
    pmovmskb    edx, xmm4                   ; bit i set => zig-zag entry i is 0
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ; invert: bit i set => entry non-zero
    bsr         eax, edx                    ; index of the last non-zero entry
    sub         edi, edx                    ; edi = -mask
    sar         edi, 31                     ; edi = (mask != 0) ? -1 : 0
    add         eax, 1
    and         eax, edi                    ; force 0 when mask == 0, where the
                                            ; bsr result is not defined

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp                         ; undo ALIGN_STACK

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
; pshufb control bytes: output byte i takes the coefficient at zig_zag1d[i].
zig_zag1d:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15