;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"


;-----------------------------------------------------------------------
; void vp8_regular_quantize_b_sse2 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;
; Quantizes one block of 16 signed 16-bit coefficients.
;
; Vector pre-pass, for every coefficient z (sz = z >> 15, x = abs(z)):
;   abs_minus_zbin[rc] = x - (zbin[rc] + zbin_extra)
;   t                  = x + round[rc]
;   temp_qcoeff[rc]    = t + ((t * quant[rc]) >> 16)
; Scalar pass, one ZIGZAG_LOOP expansion per scan position:
;   rc = zig_zag[i]
;   if (abs_minus_zbin[rc] >= *zbin_boost_ptr++) {
;       y = temp_qcoeff[rc] >> quant_shift[rc]
;       if (y) { qcoeff[rc] = y; zbin_boost_ptr = start of boost table }
;   }
; Vector post-pass:
;   qcoeff  (via d) = (qcoeff ^ sz) - sz       ; sign restored
;   dqcoeff (via d) = qcoeff * dequant         ; low 16 bits of product
;   eob     (in d)  = max(inv_zig_zag[rc]) over nonzero qcoeff[rc],
;                     0 when every coefficient quantized to zero
;
; Where the arguments are read from:
;   ABI_IS_32BIT               : b = arg(0), d = arg(1)
;   64-bit, output format x64  : b = rcx,    d = rdx
;   64-bit, any other format   : b = rdi,    d = rsi
;
; Alignment: coeff, zbin, round, quant, dequant, qcoeff and dqcoeff are
; all accessed with movdqa, so each pointer must be 16-byte aligned.
;
; Struct field offsets (vp8_block_*, vp8_blockd_*) and the helper macros
; (sym, arg, GLOBAL, SAVE_XMM, GET_GOT, ALIGN_STACK, ...) are not defined
; in this file; they are expected to come from the %include'd files.
;
; Registers that stay live across the scalar pass:
;   rax = quant_shift_ptr, rsi = zbin_boost_ptr,
;   xmm0/xmm4 = sz (low/high 8 coefficients), xmm6 = 0
;-----------------------------------------------------------------------

global sym(vp8_regular_quantize_b_sse2)
sym(vp8_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM
    GET_GOT     rbx
    push        rsi

    ; rdi is saved only for the ABIs handled here (32-bit and x64);
    ; the epilog pops it under the same conditions.
%if ABI_IS_32BIT
    push        rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    push        rdi
  %endif
%endif

    ; Local frame, addressed from rsp. The movdqa stores below need the
    ; 32-byte slots to be 16-byte aligned; ALIGN_STACK is expected to
    ; provide that (stack_size itself is a multiple of 16).
    ALIGN_STACK 16, rax
    %define BLOCKD_d          0  ;  8 bytes: saved BLOCKD *d (64-bit only)
    %define zrun_zbin_boost   8  ;  8 bytes: start of the zbin boost table
    %define abs_minus_zbin    16 ; 32 bytes: x - (zbin + zbin_extra)
    %define temp_qcoeff       48 ; 32 bytes: candidates before the shift
    %define qcoeff            80 ; 32 bytes: unsigned quantized values
    %define stack_size        112
    sub         rsp, stack_size
    ; end prolog

    ; Get b into rdi; on 64-bit also park d in the frame because the
    ; register it arrived in is reused below.
%if ABI_IS_32BIT
    mov         rdi, arg(0)
%else
  %ifidn __OUTPUT_FORMAT__,x64
    mov         rdi, rcx                    ; BLOCK *b
    mov         [rsp + BLOCKD_d], rdx
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    mov         [rsp + BLOCKD_d], rsi
  %endif
%endif

    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value

    ; z: xmm0 = coefficients 0..7, xmm4 = coefficients 8..15
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp8_block_round] ; round_ptr

    ; broadcast the low word of zbin_oq_value to all 8 word lanes
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (0 or -1 per lane); kept in xmm0/xmm4 until the
    ; sign is restored after the scalar pass
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z) = (z ^ sz) - sz
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]
    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value), saved for the scalar zero-bin test
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back so xmm1/xmm5 hold x again
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]                 ; round, low half
    movdqa      xmm6, [rdx + 16]            ; round, high half

    movdqa      xmm3, [rcx]                 ; quant, low half
    movdqa      xmm7, [rcx + 16]            ; quant, high half

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16 (signed high half of the 32-bit product)
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5

    ; xmm6 = 0; reused after the scalar pass for the nonzero test
    pxor        xmm6, xmm6
    ; zero qcoeff so positions the scalar pass skips stay 0
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rsi

; ZIGZAG_LOOP i
;   Handles scan position i (0..15); see the scalar-pass pseudo code in
;   the function header.
;   In   : rax = quant_shift_ptr, rsi = zbin_boost_ptr
;   Out  : rsi advanced by one entry, or reset to the start of the boost
;          table when a nonzero value was stored
;   Clobb: rcx, rdx, rdi, flags (b is no longer needed in rdi here)
%macro ZIGZAG_LOOP 1
    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc

    ; x
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

    ; if (x >= zbin)
    sub         cx, WORD PTR[rsi]           ; x - zbin
    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++ (lea keeps flags)
    jl          rq_zigzag_loop_%1           ; x < zbin

    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]

    ; downshift by quant_shift[rdx]
    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
    sar         edi, cl
    ; A shift count of 0 leaves EFLAGS unchanged, so ZF could still be
    ; the one produced by the sub above. Test y explicitly.
    test        edi, edi
    je          rq_zigzag_loop_%1           ; !y
    mov         WORD PTR[rsp + qcoeff + rdx*2], di ; qcoeff[rc] = y
    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
rq_zigzag_loop_%1:
%endmacro
ZIGZAG_LOOP 0
ZIGZAG_LOOP 1
ZIGZAG_LOOP 2
ZIGZAG_LOOP 3
ZIGZAG_LOOP 4
ZIGZAG_LOOP 5
ZIGZAG_LOOP 6
ZIGZAG_LOOP 7
ZIGZAG_LOOP 8
ZIGZAG_LOOP 9
ZIGZAG_LOOP 10
ZIGZAG_LOOP 11
ZIGZAG_LOOP 12
ZIGZAG_LOOP 13
ZIGZAG_LOOP 14
ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

    ; rdi = BLOCKD *d
%if ABI_IS_32BIT
    mov         rdi, arg(1)
%else
    mov         rdi, [rsp + BLOCKD_d]
%endif

    mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
    mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr

    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz  (give the quantized values their sign back)
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr

    pmullw      xmm0, xmm2                  ; dqcoeff = qcoeff * dequant
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2        ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rsi], xmm0        ; store dqcoeff
    movdqa      [rsi + 16], xmm1

    ; select the last value (in zig_zag order) for EOB
    pcmpeqw     xmm2, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm3, xmm6
    ; invert the mask: 0xffff where qcoeff != 0
    pcmpeqw     xmm6, xmm6
    pxor        xmm2, xmm6
    pxor        xmm3, xmm6
    ; mask inv_zig_zag: keep the 1-based scan position of nonzero lanes
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; select the max value: 16 -> 8 -> 4 -> 2 -> 1 candidates
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00000001b
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff                   ; keep only the winning word's low byte
    mov         [rdi + vp8_blockd_eob], eax

    ; begin epilog
    add         rsp, stack_size
    pop         rsp                         ; undo ALIGN_STACK (expected to have pushed the old rsp)
%if ABI_IS_32BIT
    pop         rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rdi
  %endif
%endif
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret

;-----------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_sse2 | arg
;  (short *coeff_ptr,               |  0
;   short *qcoeff_ptr,              |  1
;   short *dequant_ptr,             |  2
;   short *inv_scan_order,          |  3
;   short *round_ptr,               |  4
;   short *quant_ptr,               |  5
;   short *dqcoeff_ptr)             |  6
;
; Quantizes 16 signed 16-bit coefficients without a zero-bin test.
; For every coefficient z (sz = z >> 15, x = abs(z)):
;   y              = ((x + round[i]) * quant[i]) >> 16
;   qcoeff_ptr[i]  = (y ^ sz) - sz            ; sign restored
;   dqcoeff_ptr[i] = qcoeff_ptr[i] * dequant_ptr[i]   ; low 16 bits
;
; Returns (rax): the largest inv_scan_order[i] among the coefficients
; whose quantized value is nonzero, masked to its low 8 bits; 0 when all
; of them quantized to zero.
;
; Alignment: every pointer argument is accessed with movdqa or as an
; XMMWORD memory operand, so all seven must be 16-byte aligned.
;
; All arguments are read through arg(n) after SHADOW_ARGS_TO_STACK 7;
; that macro is defined outside this file and presumably makes register
; arguments addressable that way - confirm in the ABI support include.
;
; Uses xmm0-xmm5 only; rsi and rdi are saved and restored here.
;-----------------------------------------------------------------------

global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    ; z: xmm0 = coefficients 0..7, xmm4 = coefficients 8..15
    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm1, xmm2                  ;x + round
    paddw       xmm5, xmm3

    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = ((x + round) * quant) >> 16
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    ; round_ptr / quant_ptr are consumed; reuse rdi/rsi for the outputs
    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm2, XMMWORD PTR[rcx]      ;dequant lo
    movdqa      xmm3, XMMWORD PTR[rcx + 16] ;dequant hi

    ; (y ^ sz) - sz : give the quantized values their sign back
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5

    pmullw      xmm2, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm3, xmm5

    mov         rdi, arg(3)                 ;inv_scan_order

    ; Start with 16 candidates: build a mask of the nonzero coefficients
    pxor        xmm4, xmm4                  ;clear all bits
    pcmpeqw     xmm1, xmm4                  ;0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ;set all bits
    pxor        xmm1, xmm4                  ;invert: 0xffff where qcoeff != 0
    pxor        xmm5, xmm4

    ; keep the inv_scan_order entry of every nonzero coefficient
    pand        xmm1, XMMWORD PTR[rdi]
    pand        xmm5, XMMWORD PTR[rdi+16]

    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    ; Only the low byte of the winning word is returned, so the 32-bit
    ; forms are enough: writing eax clears the upper half of rax on
    ; 64-bit targets. This also matches the EOB extraction in
    ; vp8_regular_quantize_b_sse2.
    movd        eax, xmm1
    and         eax, 0xff

    movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Both tables hold 16 words (32 bytes). inv_zig_zag is read as two
; 16-byte memory operands of pand, so it must stay 16-byte aligned:
; keep it directly after the 32-byte zig_zag table, or realign it.
align 16
; zig_zag[i] = raster index (rc) of the coefficient visited i-th in
; scan order; indexed by ZIGZAG_LOOP.
zig_zag:
  dw 0x0000, 0x0001, 0x0004, 0x0008
  dw 0x0005, 0x0002, 0x0003, 0x0006
  dw 0x0009, 0x000c, 0x000d, 0x000a
  dw 0x0007, 0x000b, 0x000e, 0x000f
; inv_zig_zag[rc] = 1-based scan position of raster index rc (the
; inverse of zig_zag, plus one). Because the entries are 1-based, the
; maximum over the nonzero coefficients is the end-of-block value
; directly, and 0 means that no coefficient is nonzero.
inv_zig_zag:
  dw 0x0001, 0x0002, 0x0006, 0x0007
  dw 0x0003, 0x0005, 0x0008, 0x000d
  dw 0x0004, 0x0009, 0x000c, 0x000e
  dw 0x000a, 0x000b, 0x000f, 0x0010
360