;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"


;-----------------------------------------------------------------------------
; void vp8_regular_quantize_b_sse2 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;
; Quantizes the 16 coefficients of one block with a zero-bin test and writes
; the quantized coefficients, the dequantized coefficients and the end-of-block
; position into *d.
;
; Reads from b (through the vp8_block_* offsets): coeff, zbin, zbin_extra,
; round, quant, quant_shift, zrun_zbin_boost.
; Reads from d (through the vp8_blockd_* offsets): dequant, qcoeff, dqcoeff.
; Writes: 32 bytes at d->qcoeff, 32 bytes at d->dqcoeff, and d->eob.
;
; coeff, zbin, round, quant, dequant, qcoeff and dqcoeff are accessed with
; movdqa, so each of those arrays must be 16-byte aligned.
;
; eob is the largest inv_zig_zag entry over all non-zero quantized
; coefficients; the table holds 1..16, so eob is 0 when every coefficient
; quantizes to zero.
;
; Argument passing:
;   32-bit           : both arguments are read with arg(n)
;   x64 output format: b = rcx, d = rdx
;   other 64-bit     : b = rdi, d = rsi
;
; Scratch frame (stack_size bytes, addressed from rsp after ALIGN_STACK):
;   BLOCKD_d         saved BLOCKD pointer (64-bit builds only)
;   zrun_zbin_boost  saved start of the zero-run boost table
;   abs_minus_zbin   16 words: abs(z) - (zbin + zbin_extra)
;   temp_qcoeff      16 words: x + ((x * quant) >> 16), x = abs(z) + round
;   qcoeff           16 words: unsigned quantized values, zero-initialised
;
; NOTE(review): SAVE_XMM, GET_GOT, ALIGN_STACK, GLOBAL, arg and sym come from
; x86_abi_support.asm, which is not visible here. ALIGN_STACK presumably aligns
; rsp and pushes the previous rsp, which "pop rsp" in the epilog restores -
; confirm against the macro definitions.
;-----------------------------------------------------------------------------

global sym(vp8_regular_quantize_b_sse2)
sym(vp8_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM
    GET_GOT     rbx
    push        rsi

    ; rdi is only saved on 32-bit builds and for the x64 output format
%if ABI_IS_32BIT
    push        rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    push        rdi
  %endif
%endif

    ALIGN_STACK 16, rax
    %define BLOCKD_d          0  ;  8
    %define zrun_zbin_boost   8  ;  8
    %define abs_minus_zbin    16 ; 32
    %define temp_qcoeff       48 ; 32
    %define qcoeff            80 ; 32
    %define stack_size        112
    sub         rsp, stack_size
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)
%else
  %ifidn __OUTPUT_FORMAT__,x64
    mov         rdi, rcx                    ; BLOCK *b
    mov         [rsp + BLOCKD_d], rdx
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    mov         [rsp + BLOCKD_d], rsi
  %endif
%endif

    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value

    ; z
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp8_block_round] ; round_ptr

    ; broadcast the low word of zbin_oq_value to all eight words
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz (kept in xmm0/xmm4 until the sign is restored after the scan)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]
    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]
    movdqa      xmm6, [rdx + 16]

    movdqa      xmm3, [rcx]
    movdqa      xmm7, [rcx + 16]

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5

    ; xmm6 stays zero until the eob computation below
    pxor        xmm6, xmm6
    ; zero qcoeff
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rsi

; ZIGZAG_LOOP n - handle the n-th coefficient in zig-zag scan order.
;   In:    rsi = current zbin_boost_ptr, rax = quant_shift_ptr,
;          rsp = base of the scratch frame
;   Out:   rsi advanced by one word, or reset to the saved table start when a
;          non-zero value was stored into the scratch qcoeff array
;   Clobb: rcx, rdx, rdi, flags
%macro ZIGZAG_LOOP 1
    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc

    ; x
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

    ; if (x >= zbin)
    sub         cx, WORD PTR[rsi]           ; x - zbin
    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++ (lea keeps the flags)
    jl          rq_zigzag_loop_%1           ; x < zbin

    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]

    ; downshift by quant_shift[rdx]
    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
    sar         edi, cl
    ; sar leaves the flags untouched when the shift count is 0, so test y
    ; explicitly instead of relying on stale flags from the sub above
    test        edi, edi
    je          rq_zigzag_loop_%1           ; !y
    mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
rq_zigzag_loop_%1:
%endmacro
ZIGZAG_LOOP  0
ZIGZAG_LOOP  1
ZIGZAG_LOOP  2
ZIGZAG_LOOP  3
ZIGZAG_LOOP  4
ZIGZAG_LOOP  5
ZIGZAG_LOOP  6
ZIGZAG_LOOP  7
ZIGZAG_LOOP  8
ZIGZAG_LOOP  9
ZIGZAG_LOOP 10
ZIGZAG_LOOP 11
ZIGZAG_LOOP 12
ZIGZAG_LOOP 13
ZIGZAG_LOOP 14
ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

    ; rdi was clobbered by the scan; reload it with BLOCKD *d
%if ABI_IS_32BIT
    mov         rdi, arg(1)
%else
    mov         rdi, [rsp + BLOCKD_d]
%endif

    mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
    mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr

    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2                 ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rsi], xmm0                 ; store dqcoeff
    movdqa      [rsi + 16], xmm1

    ; select the last value (in zig_zag order) for EOB
    pcmpeqw     xmm2, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm3, xmm6
    ; !
    pcmpeqw     xmm6, xmm6                  ; all ones
    pxor        xmm2, xmm6                  ; 0xffff where qcoeff != 0
    pxor        xmm3, xmm6
    ; mask inv_zig_zag
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; select the max value
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00000001b
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff
    mov         [rdi + vp8_blockd_eob], eax

    ; begin epilog
    add         rsp, stack_size
    pop         rsp                         ; counterpart of ALIGN_STACK in the prolog
%if ABI_IS_32BIT
    pop         rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rdi
  %endif
%endif
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_sse2 | arg
;  (short *coeff_ptr,               |  0
;   short *qcoeff_ptr,              |  1
;   short *dequant_ptr,             |  2
;   short *inv_scan_order,          |  3
;   short *round_ptr,               |  4
;   short *quant_ptr,               |  5
;   short *dqcoeff_ptr)             |  6
;
; Quantizes 16 coefficients without a zero-bin test:
;   x       = abs(z) + round
;   y       = (x * quant) >> 16
;   qcoeff  = y with the sign of z restored
;   dqcoeff = low 16 bits of qcoeff * dequant
;
; Returns in rax the largest inv_scan_order entry over all non-zero qcoeff
; values, masked with 0xff (0 when every coefficient quantizes to zero).
;
; Every pointer argument is accessed 16 bytes at a time with movdqa (or as an
; aligned memory operand), so all seven arrays must be 16-byte aligned and
; hold at least 16 words. Only xmm0-xmm5 are used.
;-----------------------------------------------------------------------------

global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    ; x += round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = (x * quant) >> 16
    pmulhw      xmm1, XMMWORD PTR[rsi]
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm2, XMMWORD PTR[rcx]      ;dequant lo
    movdqa      xmm3, XMMWORD PTR[rcx + 16] ;dequant hi

    ; restore the sign: (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff

    ; dqcoeff = qcoeff * dequant
    pmullw      xmm2, xmm1
    pmullw      xmm3, xmm5

    mov         rdi, arg(3)                 ;inv_scan_order

    ; Start with 16
    pxor        xmm4, xmm4                  ;clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ;set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

    pand        xmm1, XMMWORD PTR[rdi]
    pand        xmm5, XMMWORD PTR[rdi+16]

    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    movd        rax, xmm1
    and         rax, 0xff                   ;return value

    movdqa      XMMWORD PTR[rsi], xmm2      ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
; zig_zag[i]: coefficient index visited at step i of the scan
zig_zag:
  dw 0x0000, 0x0001, 0x0004, 0x0008
  dw 0x0005, 0x0002, 0x0003, 0x0006
  dw 0x0009, 0x000c, 0x000d, 0x000a
  dw 0x0007, 0x000b, 0x000e, 0x000f
; inv_zig_zag[rc]: 1-based scan step at which coefficient rc is visited
inv_zig_zag:
  dw 0x0001, 0x0002, 0x0006, 0x0007
  dw 0x0003, 0x0005, 0x0008, 0x000d
  dw 0x0004, 0x0009, 0x000c, 0x000e
  dw 0x000a, 0x000b, 0x000f, 0x0010