; vp9_temporal_filter_apply_sse2.asm revision 233d2500723e5594f3e7c70896ffeeef32b9c950
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; void vp9_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; For every pixel of the block this routine computes
;
;     diff     = frame1 pixel - frame2 pixel
;     modifier = (3 * diff * diff + rounding) >> strength
;     weight   = (16 - min(modifier, 16)) * filter_weight
;     count[i]       += weight
;     accumulator[i] += weight * frame2 pixel
;
; with rounding = 0x8000 >> (16 - strength).
;
; Sixteen pixels are handled per loop iteration:
;   * frame2 (the predictor), count and accumulator are walked contiguously
;     and accessed with movdqa, so they must be 16-byte aligned;
;   * frame1 is walked with 'stride'. For block_size == 8 two 8-pixel rows
;     are fetched per iteration (movq); otherwise one 16-pixel row is
;     fetched with movdqa (so it must be 16-byte aligned).
; The loop stops once block_size * block_size predictor bytes were consumed.
; Only block_size 8 and 16 are handled consistently by the branches below.
;
; Register roles inside the loop:
;   rsi = frame1 row pointer      rdx = predictor pointer
;   rdi = accumulator pointer     rax = count pointer
;   rbp = stride                  rcx = end of predictor data
;   xmm7 = zero (for byte -> word and word -> dword expansion)
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; layout of the local, 16-byte aligned scratch area
    %define block_size    0
    %define strength      16
    %define filter_weight 32
    %define rounding_bit  48
    %define rbp_backup    64
    %define stack_size    80
    sub         rsp,            stack_size
    ; rbp is reused for the stride further down, so keep a copy to restore
    mov         [rsp + rbp_backup], rbp
    ; end prolog

    mov         rdx,            arg(3)
    mov         [rsp + block_size], rdx
    movd        xmm6,           arg(4)
    movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

    ; calculate the rounding bit outside the loop
    ; 0x8000 >> (16 - strength)
    ; NOTE(review): arg(4) is read at the width of rdx although the prototype
    ; declares an int - confirm the upper bits are well defined on 64-bit ABIs.
    mov         rdx,            16
    sub         rdx,            arg(4) ; 16 - strength
    movq        xmm4,           rdx    ; can't use rdx w/ shift
    movdqa      xmm5,           [GLOBAL(_const_top_bit)]
    psrlw       xmm5,           xmm4
    movdqa      [rsp + rounding_bit], xmm5

    mov         rsi,            arg(0) ; src/frame1
    mov         rdx,            arg(2) ; predictor frame
    mov         rdi,            arg(6) ; accumulator
    mov         rax,            arg(7) ; count

    ; dup the filter weight into all 8 words and store for later
    movd        xmm0,           arg(5) ; filter_weight
    pshuflw     xmm0,           xmm0, 0
    punpcklwd   xmm0,           xmm0
    movdqa      [rsp + filter_weight], xmm0

    mov         rbp,            arg(1) ; stride
    pxor        xmm7,           xmm7   ; zero for extraction

    ; rcx = one past the last predictor byte (16*16 or 8*8 bytes)
    lea         rcx,            [rdx + 16*16*1]
    cmp         dword ptr [rsp + block_size], 8
    jne         .temporal_filter_apply_load_16
    lea         rcx,            [rdx + 8*8*1]

.temporal_filter_apply_load_8:
    movq        xmm0,           [rsi]  ; first row
    lea         rsi,            [rsi + rbp] ; += stride
    punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
    movq        xmm1,           [rsi]  ; second row
    lea         rsi,            [rsi + rbp] ; += stride
    punpcklbw   xmm1,           xmm7   ; src[ 8-15]
    jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
    movdqa      xmm0,           [rsi]  ; src (frame1)
    lea         rsi,            [rsi + rbp] ; += stride
    movdqa      xmm1,           xmm0
    punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
    punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
    movdqa      xmm2,           [rdx]  ; predictor (frame2)
    movdqa      xmm3,           xmm2
    punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
    punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

    ; modifier = src_byte - pixel_value
    psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
    psubw       xmm1,           xmm3   ; src - pred[ 8-15]

    ; modifier *= modifier
    ; |diff| <= 255, so the square (<= 65025) still fits an unsigned word
    pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
    pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

    ; modifier *= 3
    ; Done as x + x + x with unsigned saturation: a plain 16-bit multiply
    ; wraps for |diff| >= 148 and would give very different pixels a
    ; non-zero weight. xmm2/xmm3 are free here (pred is reloaded later).
    movdqa      xmm2,           xmm0
    movdqa      xmm3,           xmm1
    paddusw     xmm0,           xmm0   ; 2 * modifier[ 0- 7]
    paddusw     xmm1,           xmm1   ; 2 * modifier[ 8-15]
    paddusw     xmm0,           xmm2   ; 3 * modifier[ 0- 7]
    paddusw     xmm1,           xmm3   ; 3 * modifier[ 8-15]

    ; modifier += 0x8000 >> (16 - strength)
    ; saturating as well, so the rounding term cannot wrap a large value
    paddusw     xmm0,           [rsp + rounding_bit]
    paddusw     xmm1,           [rsp + rounding_bit]

    ; modifier >>= strength
    psrlw       xmm0,           [rsp + strength]
    psrlw       xmm1,           [rsp + strength]

    ; modifier = 16 - modifier
    ; saturation takes care of modifier > 16
    movdqa      xmm3,           [GLOBAL(_const_16w)]
    movdqa      xmm2,           [GLOBAL(_const_16w)]
    psubusw     xmm3,           xmm1
    psubusw     xmm2,           xmm0

    ; modifier *= filter_weight
    pmullw      xmm2,           [rsp + filter_weight]
    pmullw      xmm3,           [rsp + filter_weight]

    ; count
    movdqa      xmm4,           [rax]
    movdqa      xmm5,           [rax+16]
    ; += modifier
    paddw       xmm4,           xmm2
    paddw       xmm5,           xmm3
    ; write back
    movdqa      [rax],          xmm4
    movdqa      [rax+16],       xmm5
    lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

    ; load and extract the predictor up to shorts
    pxor        xmm7,           xmm7
    movdqa      xmm0,           [rdx]
    lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
    movdqa      xmm1,           xmm0
    punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
    punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

    ; modifier *= pixel_value
    pmullw      xmm0,           xmm2
    pmullw      xmm1,           xmm3

    ; expand to double words
    movdqa      xmm2,           xmm0
    punpcklwd   xmm0,           xmm7   ; [ 0- 3]
    punpckhwd   xmm2,           xmm7   ; [ 4- 7]
    movdqa      xmm3,           xmm1
    punpcklwd   xmm1,           xmm7   ; [ 8-11]
    punpckhwd   xmm3,           xmm7   ; [12-15]

    ; accumulator (xmm7 is overwritten here and re-zeroed before looping)
    movdqa      xmm4,           [rdi]
    movdqa      xmm5,           [rdi+16]
    movdqa      xmm6,           [rdi+32]
    movdqa      xmm7,           [rdi+48]
    ; += modifier
    paddd       xmm4,           xmm0
    paddd       xmm5,           xmm2
    paddd       xmm6,           xmm1
    paddd       xmm7,           xmm3
    ; write back
    movdqa      [rdi],          xmm4
    movdqa      [rdi+16],       xmm5
    movdqa      [rdi+32],       xmm6
    movdqa      [rdi+48],       xmm7
    lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

    ; done once the whole predictor block has been consumed
    cmp         rdx,            rcx
    je          .temporal_filter_apply_epilog
    pxor        xmm7,           xmm7   ; zero for extraction
    cmp         dword ptr [rsp + block_size], 16
    je          .temporal_filter_apply_load_16
    jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup]
    add         rsp,            stack_size
    ; presumably pops the pre-alignment rsp saved by ALIGN_STACK (macro is
    ; defined in x86_abi_support.asm)
    pop         rsp
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
_const_top_bit:
    times 8 dw 1<<15
align 16
_const_16w:
    times 8 dw 16