; quantize_sse2.asm revision 538f6170b788de7408b06efc6613dc98579aa6a6
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
;               short *qcoeff_ptr,short *dequant_ptr,
;               const int *default_zig_zag, short *round_ptr,
;               short *quant_ptr, short *dqcoeff_ptr,
;               unsigned short zbin_oq_value,
;               short *zbin_boost_ptr);
;
; Quantizes one block of 16 signed 16-bit coefficients.
;
; Vector pre-pass, for every coefficient z (16-bit lanes, wrapping arithmetic):
;   sz           = z >> 15                                (0 or -1)
;   x            = (z ^ sz) - sz                          (abs(z))
;   x_minus_zbin = x - (zbin_ptr[] + zbin_oq_value)       -> stack scratch
;   y            = ((x + round_ptr[]) * quant_ptr[]) >> 16   (pmulhw, signed)
;   temp_qcoeff  = (y ^ sz) - sz                          -> stack scratch
; qcoeff_ptr[0..15] is then cleared.
;
; Scalar pass, i = 0..15 (four positions handled per loop trip):
;   rc   = default_zig_zag[i]
;   zbin = *zbin_boost_ptr++
;   if (x_minus_zbin[rc] >= zbin && temp_qcoeff[rc] != 0) {
;       qcoeff_ptr[rc] = temp_qcoeff[rc]
;       zbin_boost_ptr = arg(9)          (back to its initial value)
;       eob            = i
;   }
;
; Finally dqcoeff_ptr[] = qcoeff_ptr[] * dequant_ptr[] (pmullw, low 16 bits).
;
; Returns eob + 1 in rax. eob starts at -1, so 0 means no coefficient was kept.
;
; coeff_ptr, zbin_ptr, round_ptr, quant_ptr, qcoeff_ptr, dequant_ptr and
; dqcoeff_ptr are all accessed with movdqa and must therefore be 16-byte
; aligned; each is read/written as two 16-byte halves (16 shorts).
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, ALIGN_STACK and UNSHADOW_ARGS are macros
; from x86_abi_support.asm; their expansions are not visible in this file.
global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 10
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax

    ; Byte offsets into the local scratch area addressed from rsp.
    %define abs_minus_zbin_lo 0
    %define abs_minus_zbin_hi 16
    %define temp_qcoeff_lo 32
    %define temp_qcoeff_hi 48
    %define save_xmm6 64
    %define save_xmm7 80
    %define eob 96

    %define vp8_regularquantizeb_stack_size eob + 16

    sub         rsp, vp8_regularquantizeb_stack_size

    ; xmm6/xmm7 are used as temporaries below, so keep the caller's values.
    ; NOTE(review): presumably needed because they are callee-saved under the
    ; Microsoft x64 convention - confirm against the supported ABIs.
    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
    movdqa      OWORD PTR[rsp + save_xmm7], xmm7

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         eax, arg(8)                 ;zbin_oq_value

    mov         rcx, arg(1)                 ;zbin_ptr
    movd        xmm7, eax

    movdqa      xmm0, OWORD PTR[rdx]        ;z, coefficients 0..7
    movdqa      xmm4, OWORD PTR[rdx + 16]   ;z, coefficients 8..15

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr
    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr

    ; pshuflw + punpcklwd together copy the low word into all eight lanes.
    pshuflw     xmm7, xmm7, 0
    psubw       xmm1, xmm0                  ;x = abs(z)

    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm2, xmm7                  ;zbin_ptr + zbin_oq_value
    paddw       xmm3, xmm7                  ;zbin_ptr + zbin_oq_value

    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)

    mov         rdi, arg(5)                 ;round_ptr
    mov         rsi, arg(6)                 ;quant_ptr

    ; Spill x - zbin so the scalar loop can index it by rc.
    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5

    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back

    movdqa      xmm2, OWORD PTR[rdi]        ;round, coefficients 0..7
    movdqa      xmm3, OWORD PTR[rsi]        ;quant, coefficients 0..7

    movdqa      xmm6, OWORD PTR[rdi + 16]   ;round, coefficients 8..15
    movdqa      xmm7, OWORD PTR[rsi + 16]   ;quant, coefficients 8..15

    paddw       xmm1, xmm2                  ;x + round
    paddw       xmm5, xmm6                  ;x + round

    pmulhw      xmm1, xmm3                  ;y = ((x + round) * quant) >> 16
    pmulhw      xmm5, xmm7                  ;y = ((x + round) * quant) >> 16

    mov         rsi, arg(2)                 ;qcoeff_ptr
    pxor        xmm6, xmm6                  ;all-zero vector

    pxor        xmm1, xmm0                  ;(y ^ sz) ...
    pxor        xmm5, xmm4

    psubw       xmm1, xmm0                  ;... - sz: sign of z re-applied
    psubw       xmm5, xmm4

    ; Spill the candidate quantized values so the scalar loop can index them.
    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5

    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff
    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff

    xor         rax, rax                    ;i = 0
    mov         rcx, -1

    mov         [rsp + eob], rcx            ;eob = -1
    mov         rsi, arg(9)                 ;zbin_boost_ptr

    mov         rbx, arg(4)                 ;default_zig_zag

    ; Each trip handles positions i, i+1, i+2 and i+3. In the last three steps
    ; the zig-zag entry is fetched with a +4 byte displacement because rax is
    ; only incremented after that load.
    ; edi receives zbin in every step, which is why qcoeff_ptr is reloaded
    ; into rdi each time a coefficient is stored.
rq_zigzag_loop:
    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1                 ;below the boosted zero bin: skip

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    cmp         edx, 0
    je          rq_zigzag_1                 ;quantized to zero: skip

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1a

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    cmp         edx, 0
    je          rq_zigzag_1a

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1a:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1b

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    cmp         edx, 0
    je          rq_zigzag_1b

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1b:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1c

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    cmp         edx, 0
    je          rq_zigzag_1c

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1c:
    lea         rax, [rax + 1]

    cmp         rax, 16
    jl          rq_zigzag_loop

    mov         rdi, arg(2)                 ;qcoeff_ptr
    mov         rcx, arg(3)                 ;dequant_ptr
    mov         rsi, arg(7)                 ;dqcoeff_ptr

    movdqa      xmm2, OWORD PTR[rdi]
    movdqa      xmm3, OWORD PTR[rdi + 16]

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]

    pmullw      xmm0, xmm2                  ;dequant * qcoeff, low 16 bits
    pmullw      xmm1, xmm3                  ;dequant * qcoeff, low 16 bits

    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff
    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff

    mov         rax, [rsp + eob]

    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

    add         rax, 1                      ;return eob + 1

    add         rsp, vp8_regularquantizeb_stack_size
    ; NOTE(review): presumably pops the pre-alignment rsp that ALIGN_STACK
    ; pushed - confirm against the macro definition.
    pop         rsp

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
;                       short *qcoeff_ptr,short *dequant_ptr,
;                       short *scan_mask, short *round_ptr,
;                       short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes one block of 16 signed 16-bit coefficients entirely in vector
; registers. For every coefficient z (16-bit lanes, wrapping arithmetic):
;   sz        = z >> 15                                   (0 or -1)
;   x         = (z ^ sz) - sz                             (abs(z))
;   y         = ((x + round_ptr[]) * quant_ptr[]) >> 16   (pmulhw, signed)
;   qcoeff[]  = (y ^ sz) - sz
;   dqcoeff[] = qcoeff[] * dequant_ptr[]                  (pmullw, low 16 bits)
;
; Return value (rax): every lane whose qcoeff is non-zero contributes its
; scan_mask[] word to a sum; with m = low 16 bits of that sum the function
; returns 0 when m == 0, otherwise (index of the highest set bit of m) + 1.
; NOTE(review): presumably each scan_mask[] entry has a single bit set at the
; coefficient's zig-zag position, which would make the result the
; end-of-block index - confirm against the table the caller passes.
;
; coeff_ptr, round_ptr, quant_ptr, dequant_ptr, scan_mask, qcoeff_ptr and
; dqcoeff_ptr are accessed with movdqa or as a pmulhw memory operand and must
; therefore be 16-byte aligned; each is read/written as two 16-byte halves.
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, ALIGN_STACK and UNSHADOW_ARGS are macros
; from x86_abi_support.asm; their expansions are not visible in this file.
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax

    ; Byte offsets into the local scratch area addressed from rsp.
    %define save_xmm6 0
    %define save_xmm7 16

    %define vp8_fastquantizeb_stack_size save_xmm7 + 16

    sub         rsp, vp8_fastquantizeb_stack_size

    ; xmm6/xmm7 are used as temporaries below, so keep the caller's values.
    ; NOTE(review): presumably needed because they are callee-saved under the
    ; Microsoft x64 convention - confirm against the supported ABIs.
    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6
    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rax, arg(3)                 ;scan_mask
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]      ;z, coefficients 0..7
    movdqa      xmm4, XMMWORD PTR[rdx + 16] ;z, coefficients 8..15

    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm1, xmm6                  ;x + round
    paddw       xmm5, xmm7                  ;x + round

    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = ((x + round) * quant) >> 16
    pmulhw      xmm5, XMMWORD PTR[rsi + 16] ;y = ((x + round) * quant) >> 16

    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm6, XMMWORD PTR[rcx]      ;dequant, coefficients 0..7
    movdqa      xmm7, XMMWORD PTR[rcx + 16] ;dequant, coefficients 8..15

    pxor        xmm1, xmm0                  ;(y ^ sz) ...
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;... - sz: sign of z re-applied
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff

    pmullw      xmm6, xmm1                  ;dequant * qcoeff, low 16 bits
    pmullw      xmm7, xmm5                  ;dequant * qcoeff, low 16 bits

    movdqa      xmm2, XMMWORD PTR[rax]      ;scan_mask, coefficients 0..7
    movdqa      xmm3, XMMWORD PTR[rax+16]   ;scan_mask, coefficients 8..15

    ; Turn every lane into 1 if its qcoeff is non-zero, else 0:
    ; compare-equal-to-zero, invert, then keep only the top bit.
    pxor        xmm4, xmm4                  ;clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ;set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

    psrlw       xmm1, 15
    psrlw       xmm5, 15

    ; Multiply the 0/1 flags by the mask words; pmaddwd also adds adjacent
    ; products, leaving four dword partial sums per register.
    pmaddwd     xmm1, xmm2
    pmaddwd     xmm5, xmm3

    ; Horizontal add of all eight dword partial sums into the low dword of
    ; xmm5 (movq xmm, xmm copies the low 64 bits and zeroes the rest).
    movq        xmm2, xmm1
    movq        xmm3, xmm5

    psrldq      xmm1, 8
    psrldq      xmm5, 8

    paddd       xmm1, xmm5
    paddd       xmm2, xmm3

    paddd       xmm1, xmm2
    movq        xmm5, xmm1

    psrldq      xmm1, 4
    paddd       xmm5, xmm1

    movq        rcx, xmm5
    and         rcx, 0xffff                 ;m = low 16 bits of the sum

    xor         rdx, rdx
    sub         rdx, rcx                    ;rdx = -m

    ; bsr leaves rax undefined when m == 0; the mask below forces 0 then.
    bsr         rax, rcx
    inc         rax

    sar         rdx, 31                     ;all ones if m != 0, else 0
    and         rax, rdx

    movdqa      XMMWORD PTR[rsi], xmm6      ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm7 ;store dqcoeff

    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]

    add         rsp, vp8_fastquantizeb_stack_size
    ; NOTE(review): presumably pops the pre-alignment rsp that ALIGN_STACK
    ; pushed - confirm against the macro definition.
    pop         rsp

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret