;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;

; VP8 encoder quantization, SSE2 versions (NASM/yasm syntax).
;
; ABI portability (32-bit vs. SysV x86-64 vs. Win64) is handled by the
; macros from x86_abi_support.asm: sym(), arg(), GLOBAL(), SAVE_XMM,
; GET_GOT, ALIGN_STACK, SHADOW_ARGS_TO_STACK, etc.  Structure-field
; offsets (vp8_block_*, vp8_blockd_*) come from asm_enc_offsets.asm.

%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"


;------------------------------------------------------------------------
; void vp8_regular_quantize_b_sse2 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;
; Full (zbin/zrun-boost) quantization of one 4x4 block of 16 coefficients.
; Reads coeff/zbin/round/quant/quant_shift/zrun_zbin_boost pointers out of
; *b, writes d->qcoeff, d->dqcoeff and d->eob.
;
; SIMD phase register roles (two 8-lane halves of the 16 coefficients):
;   xmm0/xmm4 = sz  (sign masks of z, lo/hi half)
;   xmm1/xmm5 = x   (working value, lo/hi half)
;   xmm7      = zbin_oq_value broadcast to all 8 word lanes
; Scalar phase (ZIGZAG_LOOP): rsi = zbin_boost_ptr, rax = quant_shift_ptr,
; rdx = rc (zig-zag position), scratch rcx/rdi.
;------------------------------------------------------------------------

global sym(vp8_regular_quantize_b_sse2)
sym(vp8_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM                            ; Win64: preserve xmm6+ (callee-saved there)
    GET_GOT     rbx
    push        rsi

%if ABI_IS_32BIT
    push        rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    push        rdi                     ; rdi is callee-saved on Win64 only
  %endif
%endif

    ALIGN_STACK 16, rax                 ; 16-byte align rsp for movdqa scratch below
    ; 16-byte-aligned scratch frame layout (offset ; size in bytes):
    %define BLOCKD_d          0  ;  8   ; saved BLOCKD *d (64-bit ABIs)
    %define zrun_zbin_boost   8  ;  8   ; saved b->zrun_zbin_boost base pointer
    %define abs_minus_zbin    16 ; 32   ; per-coeff x - (zbin + zbin_oq), 16 words
    %define temp_qcoeff       48 ; 32   ; per-coeff candidate quantized values
    %define qcoeff            80 ; 32   ; accepted quantized values (zig-zag pass)
    %define stack_size        112
    sub         rsp, stack_size
    ; end prolog

    ; Load BLOCK *b into rdi and stash BLOCKD *d for after the SIMD phase.
%if ABI_IS_32BIT
    mov         rdi, arg(0)
%else
  %ifidn __OUTPUT_FORMAT__,x64
    mov         rdi, rcx                ; BLOCK *b
    mov         [rsp + BLOCKD_d], rdx
  %else
    ;mov        rdi, rdi                ; BLOCK *b already in rdi (SysV arg 0)
    mov         [rsp + BLOCKD_d], rsi
  %endif
%endif

    mov         rdx, [rdi + vp8_block_coeff]      ; coeff_ptr
    mov         rcx, [rdi + vp8_block_zbin]       ; zbin_ptr
    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value

    ; z: the 16 input coefficients, split into two 8-word halves
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp8_block_round]      ; round_ptr

    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7              ; duplicated zbin_oq_value in all 8 lanes

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (all-ones lanes where z is negative)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)  via (z ^ sz) - sz
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]
    mov         rcx, [rdi + vp8_block_quant]      ; quant_ptr

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value); stored for the scalar zbin test below
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back to recover x
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]             ; round, lo/hi
    movdqa      xmm6, [rdx + 16]

    movdqa      xmm3, [rcx]             ; quant, lo/hi
    movdqa      xmm7, [rcx + 16]

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5

    pxor        xmm6, xmm6
    ; zero qcoeff scratch; the zig-zag pass only writes accepted coefficients
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp8_block_quant_shift]     ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rsi    ; keep base so the pointer can be reset

; Scalar per-coefficient pass, fully unrolled over the 16 zig-zag positions.
; For each position rc: accept temp_qcoeff[rc] (shifted down by quant_shift[rc])
; into qcoeff[rc] iff x >= zbin + running zbin_boost; a surviving non-zero
; coefficient resets zbin_boost_ptr to the start of the boost table.
; Clobbers: rdx, rcx, rdi, rsi (advanced/reset), flags.
%macro ZIGZAG_LOOP 1
    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc

    ; x (precomputed x - (zbin + zbin_oq) for this coefficient)
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

    ; if (x >= zbin)
    sub         cx, WORD PTR[rsi]       ; x - zbin_boost
    lea         rsi, [rsi + 2]          ; zbin_boost_ptr++
    jl          rq_zigzag_loop_%1       ; x < zbin: leave qcoeff[rc] == 0

    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]

    ; downshift by quant_shift[rdx]
    movsx       ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc]
    sar         edi, cl                 ; also sets Z bit, tested by the je below
    je          rq_zigzag_loop_%1       ; !y: quantized to zero, boost keeps running
    mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
rq_zigzag_loop_%1:
%endmacro
ZIGZAG_LOOP  0
ZIGZAG_LOOP  1
ZIGZAG_LOOP  2
ZIGZAG_LOOP  3
ZIGZAG_LOOP  4
ZIGZAG_LOOP  5
ZIGZAG_LOOP  6
ZIGZAG_LOOP  7
ZIGZAG_LOOP  8
ZIGZAG_LOOP  9
ZIGZAG_LOOP 10
ZIGZAG_LOOP 11
ZIGZAG_LOOP 12
ZIGZAG_LOOP 13
ZIGZAG_LOOP 14
ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

    ; switch to BLOCKD *d for the output pointers
%if ABI_IS_32BIT
    mov         rdi, arg(1)
%else
    mov         rdi, [rsp + BLOCKD_d]
%endif

    mov         rcx, [rdi + vp8_blockd_dequant]   ; dequant_ptr
    mov         rsi, [rdi + vp8_blockd_dqcoeff]   ; dqcoeff_ptr

    ; re-apply the sign masks (xmm0/xmm4 = sz, still live from the SIMD phase):
    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rdi + vp8_blockd_qcoeff]    ; qcoeff_ptr

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2             ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rsi], xmm0             ; store dqcoeff
    movdqa      [rsi + 16], xmm1

    ; select the last value (in zig_zag order) for EOB
    ; (xmm6 is still zero from the qcoeff clear above)
    pcmpeqw     xmm2, xmm6              ; lanes where qcoeff == 0
    pcmpeqw     xmm3, xmm6
    ; ! (invert: lanes where qcoeff != 0)
    pcmpeqw     xmm6, xmm6
    pxor        xmm2, xmm6
    pxor        xmm3, xmm6
    ; mask inv_zig_zag: each non-zero lane now holds its 1-based scan position
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; select the max value: horizontal max over 16 -> 8 -> 4 -> 2 -> 1 lanes
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00000001b
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff               ; eob is in the low word; values are 0..16
    mov         [rdi + vp8_blockd_eob], eax

    ; begin epilog
    add         rsp, stack_size
    pop         rsp                     ; restore pre-ALIGN_STACK rsp (pushed by the macro)
%if ABI_IS_32BIT
    pop         rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rdi
  %endif
%endif
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret


;------------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_sse2 | arg
;  (short *coeff_ptr,               |  0
;   short *qcoeff_ptr,              |  1
;   short *dequant_ptr,             |  2
;   short *inv_scan_order,          |  3
;   short *round_ptr,               |  4
;   short *quant_ptr,               |  5
;   short *dqcoeff_ptr)             |  6
;
; Fast quantization: no zbin test and no zrun boost, fully SIMD.
;   y = ((abs(z) + round) * quant) >> 16, re-signed, for all 16 coefficients.
; Writes qcoeff_ptr and dqcoeff_ptr (qcoeff * dequant); returns the EOB
; (highest 1-based position in inv_scan_order with a non-zero qcoeff) in rax.
;------------------------------------------------------------------------

global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7              ; make arg(n) valid on all ABIs
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)             ;coeff_ptr
    mov         rcx, arg(2)             ;dequant_ptr
    mov         rdi, arg(4)             ;round_ptr
    mov         rsi, arg(5)             ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]      ; z, lo 8 coefficients
    movdqa      xmm4, XMMWORD PTR[rdx + 16] ; z, hi 8 coefficients

    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                ;sign of z (aka sz)
    psraw       xmm4, 15                ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0              ;x = abs(z)
    psubw       xmm5, xmm4              ;x = abs(z)

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = (x * quant) >> 16
    pmulhw      xmm1, XMMWORD PTR[rsi]
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    mov         rdi, arg(1)             ;qcoeff_ptr
    mov         rsi, arg(6)             ;dqcoeff_ptr

    movdqa      xmm2, XMMWORD PTR[rcx]      ; dequant lo
    movdqa      xmm3, XMMWORD PTR[rcx + 16] ; dequant hi

    ; restore the sign: x = (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1      ; store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5

    pmullw      xmm2, xmm1              ; dqcoeff = qcoeff * dequant
    pmullw      xmm3, xmm5

    mov         rdi, arg(3)             ;inv_scan_order

    ; EOB: horizontal max of inv_scan_order over non-zero lanes.
    ; Start with 16
    pxor        xmm4, xmm4              ;clear all bits
    pcmpeqw     xmm1, xmm4              ; lanes where qcoeff == 0
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4              ;set all bits
    pxor        xmm1, xmm4              ; invert -> lanes where qcoeff != 0
    pxor        xmm5, xmm4

    pand        xmm1, XMMWORD PTR[rdi]      ; keep scan position of non-zero lanes
    pand        xmm5, XMMWORD PTR[rdi+16]

    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    movd        rax, xmm1
    and         rax, 0xff               ; eob in the low word; return value

    movdqa      XMMWORD PTR[rsi], xmm2      ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm3 ;store dqcoeff

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Forward zig-zag scan order for a 4x4 block: zig_zag[i] = raster index of
; the i-th coefficient in scan order.
zig_zag:
    dw 0x0000, 0x0001, 0x0004, 0x0008
    dw 0x0005, 0x0002, 0x0003, 0x0006
    dw 0x0009, 0x000c, 0x000d, 0x000a
    dw 0x0007, 0x000b, 0x000e, 0x000f
; Inverse scan table indexed by raster position, holding 1-based scan
; positions (1..16) so that a masked horizontal max yields the EOB count
; directly (0 when all coefficients quantize to zero).
inv_zig_zag:
    dw 0x0001, 0x0002, 0x0006, 0x0007
    dw 0x0003, 0x0005, 0x0008, 0x000d
    dw 0x0004, 0x0009, 0x000c, 0x000e
    dw 0x000a, 0x000b, 0x000f, 0x0010