;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; void vp8_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; Per pixel, with src taken from frame1 and pred taken from frame2:
;     modifier  = (src - pred)^2 * 3
;     modifier  = (modifier + (0x8000 >> (16 - strength))) >> strength
;     modifier  = max(16 - modifier, 0) * filter_weight
;     count       += modifier                 (16-bit lanes)
;     accumulator += modifier * pred          (32-bit lanes; the product is
;                                              formed with pmullw, so only its
;                                              low 16 bits are kept)
;
; Notes
;   * Every loop iteration handles 16 pixels: one 16-byte row of frame1 when
;     block_size == 16, or two 8-byte rows (stride apart) when block_size == 8.
;     frame2 is always read as 16 contiguous bytes and advanced by 16, i.e. it
;     is walked as a packed block without a stride.
;   * The loop ends when the frame2 cursor reaches frame2 + 8*8 (block_size
;     == 8) or frame2 + 16*16 (any other block_size).
;   * block_size is expected to be 8 or 16. Any other value selects the 16x16
;     end pointer and a 16-wide first load, but falls into the 8-wide load on
;     later iterations.
;   * movdqa is used on frame2, accumulator and count (and on frame1 in the
;     16-wide path), so those addresses must be 16-byte aligned.
;
; Register roles inside the loop
;   rsi = frame1 cursor         rdx = frame2 cursor      rcx = frame2 end
;   rdi = accumulator cursor    rax = count cursor       rbp = stride
;   xmm7 = zero for unpacking (re-zeroed each iteration because it is also
;          used as scratch for the accumulator update)
global sym(vp8_temporal_filter_apply_sse2) PRIVATE
sym(vp8_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; 16-byte-spaced stack locals (offsets from rsp after the sub below);
    ; the vector-sized ones are accessed with movdqa.
    %define block_size    0
    %define strength      16
    %define filter_weight 32
    %define rounding_bit  48
    %define rbp_backup    64
    %define stack_size    80
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp     ; rbp is reused for stride below
    ; end prolog

        mov         rdx,            arg(3)
        mov         [rsp + block_size], rdx ; only the low dword is compared later
        movd        xmm6,            arg(4)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is a C int, so only the low 32 bits of its slot are
        ; meaningful. Do the subtraction in 32 bits; writing edx zero-extends
        ; into rdx, which keeps stray upper bits out of the psrlw shift count
        ; (a count above 15 would zero every lane).
        mov         edx,            16
        sub         edx,            arg(4) ; 16 - strength
        movd        xmm4,           rdx    ; can't use rdx w/ shift
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(6) ; accumulator
        mov         rax,            arg(7) ; count

        ; dup the filter weight into all 8 words and store for later
        movd        xmm0,           arg(5) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; stride is an unsigned int: load 32 bits so the value used in the
        ; address arithmetic below is zero-extended.
        ; NOTE(review): arg() is presumably rbp-relative, so this has to stay
        ; the last arg() access - confirm against x86_abi_support.asm.
        mov         ebp,            arg(1) ; stride
        pxor        xmm7,           xmm7   ; zero for extraction

        ; rcx = end of the predictor block: 16*16 bytes, or 8*8 when
        ; block_size == 8
        lea         rcx,            [rdx + 16*16*1]
        cmp         dword ptr [rsp + block_size], 8
        jne         .temporal_filter_apply_load_16
        lea         rcx,            [rdx + 8*8*1]

.temporal_filter_apply_load_8:
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        pmullw      xmm0,           [GLOBAL(_const_3w)]
        pmullw      xmm1,           [GLOBAL(_const_3w)]

        ; modifier += 0x8000 >> (16 - strength)
        paddw       xmm0,           [rsp + rounding_bit]
        paddw       xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (reloaded from memory: xmm2/xmm3 now hold the modifiers)
        pxor        xmm7,           xmm7
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 stops being zero from here on)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; done once the predictor cursor reaches the end pointer
        cmp         rdx,            rcx
        je          .temporal_filter_apply_epilog
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword ptr [rsp + block_size], 16
        je          .temporal_filter_apply_load_16
        jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup]
    add         rsp,            stack_size
    ; NOTE(review): pairs with ALIGN_STACK in the prolog, which presumably
    ; pushed the pre-alignment rsp - confirm against x86_abi_support.asm.
    pop         rsp
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
_const_3w:
    times 8 dw 3
align 16
_const_top_bit:
    times 8 dw 1<<15
align 16
_const_16w:
    times 8 dw 16