;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


; NOTE(review): the ABI helper macros used below (sym, arg, GET_GOT,
; ALIGN_STACK, SAVE_XMM, ...) and the vp8_block_* / vp8_blockd_* structure
; offsets are not defined in this file; presumably they come from these two
; includes -- confirm against the build tree.
%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;
; Quantizes the 16 coefficients of one block.
;
;   1. Load 16 words from b->coeff and split them into a sign mask (sz) and
;      a magnitude x = abs(z).
;   2. Visit the coefficients in zig-zag scan order. A coefficient is kept
;      only if  x - (zbin[rc] + zbin_extra) >= *zbin_boost_ptr, in which case
;        y = ((x + round) + (((x + round) * quant) >> 16)) >> quant_shift[rc]
;      A nonzero y is recorded and zbin_boost_ptr is rewound to the start of
;      b->zrun_zbin_boost; otherwise the pointer just keeps advancing.
;   3. Re-apply the sign, store the 16 words to d->qcoeff, multiply by
;      d->dequant and store the products to d->dqcoeff.
;   4. Store, as one byte through d->eob, 1 + the scan position of the last
;      nonzero quantized coefficient (0 when all are zero).
;
; Argument location:
;   32-bit         : stack, fetched through arg(0) / arg(1)
;   64-bit, Win64  : rcx / rdx, copied to rdi / rsi
;   64-bit, other  : already in rdi / rsi
;
; All 16-byte vector loads and stores use movdqa, so coeff, zbin, round,
; quant, quant_shift, dequant, qcoeff and dqcoeff must be 16-byte aligned.
; quant_shift is read as 16 bytes (pextrb), one shift count per coefficient.
; Requires SSE4.1 (pextrb) and SSSE3 (pshufb).
;
; Vector register roles in the main body:
;   xmm0 / xmm1 : sz            (coefficients 0-7 / 8-15)
;   xmm2 / xmm3 : x, later x + round, later y before the per-lane shift
;   xmm6 / xmm7 : x - (zbin + zbin_extra)
;   xmm5        : quant_shift bytes (during the zig-zag pass)
;   xmm4 / xmm8 : quantized output accumulators (64-bit builds only; 32-bit
;                 builds accumulate into 32 bytes of aligned stack instead)

global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax
    %define qcoeff      0 ; 32 bytes of scratch for the quantized words
    %define stack_size 32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    ; rdi, rsi and xmm6+ are callee-saved under the Microsoft x64 ABI.
    SAVE_XMM 8, u
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value: broadcast word 0 of xmm7 into all 8 words
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz = z >> 15 (arithmetic): 0xffff for negative words, 0 otherwise
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z) = (z ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    ; xmm7 is free again from here on; it is reused for the high half below
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    ; last reads through rdi (b); rdi is used as scratch in ZIGZAG_LOOP
    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = (x + round) * quant_ptr >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += (x + round)
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; clear the quantized-output accumulators
    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; quant_shift
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost: rax keeps the table start, rdx walks through it
    mov         rax, rdx

; ZIGZAG_LOOP rc, lane, yreg, xreg, qreg
;   %1  raster index of the coefficient (0-15): selects the quant_shift byte
;       and, on 32-bit builds, the stack slot that receives the result
;   %2  word lane of that coefficient inside its xmm register (0-7)
;   %3  register holding y before the per-coefficient shift
;   %4  register holding x - (zbin + zbin_extra)
;   %5  register that accumulates the quantized words (64-bit builds only)
; Clobbers rcx, rdi and flags; advances or rewinds rdx.
%macro ZIGZAG_LOOP 5
    ; x
    pextrw      ecx, %4, %2

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++ (lea keeps flags)
    jl          .rq_zigzag_loop_%1          ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    sar         edi, cl
    ; A shift count of 0 leaves the flags untouched, which would make the
    ; branch below test the stale result of the sub above. Test explicitly.
    test        edi, edi
    je          .rq_zigzag_loop_%1          ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

%if ABI_IS_32BIT
    ; fetch the quantized magnitudes accumulated on the stack
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    ; The high half was accumulated in xmm8. Alias the name so the shared
    ; code below can keep saying xmm5 (undone in the epilog).
    %define     xmm5 xmm8
%endif

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    ; dqcoeff = qcoeff * dequant (low 16 bits of each product)
    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm6

    packsswb    xmm4, xmm5                  ; one byte per coefficient
    pshufb      xmm4, [GLOBAL(zig_zag1d)]   ; reorder bytes into scan order
    pmovmskb    edx, xmm4                   ; bit i set: scan position i is zero
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ; invert: bit i set means nonzero
    bsr         eax, edx                    ; highest nonzero scan position
    sub         edi, edx                    ; edi = -mask
    sar         edi, 31                     ; all ones if mask != 0, else 0
    add         eax, 1
    and         eax, edi                    ; eob = 0 when nothing is nonzero
                                            ; (also discards bsr's undefined
                                            ; result for a zero source)

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp                         ; undo ALIGN_STACK

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
; Byte i holds the raster index of the coefficient at scan position i.
; Used as the pshufb control mask when computing the end-of-block index,
; hence the 16-byte alignment required by the memory operand.
zig_zag1d:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15