;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_size,       |  3
;   int             strength,         |  4
;   int             filter_weight,    |  5
;   unsigned int   *accumulator,      |  6
;   unsigned short *count)            |  7
;
; Per 16 pixels, each loop iteration computes
;     modifier = 16 - min(16, (3 * (src - pred)^2 + round) >> strength)
;     modifier *= filter_weight
;     count[i]       += modifier               (16-bit lanes)
;     accumulator[i] += modifier * pred[i]     (32-bit lanes)
; with round = 0x8000 >> (16 - strength).
;
; Syntax: Intel (NASM/YASM), using the macros from x86_abi_support.asm
; (arg, sym, GLOBAL, SHADOW_ARGS_TO_STACK, ...), which hide the
; differences between the supported ABIs.
;
; Register roles inside the loop:
;   rsi  = src (frame1) row pointer, advanced by stride
;   rbp  = stride (the frame pointer is saved on the local stack)
;   rdx  = predictor (frame2) pointer, advanced 16 bytes per iteration
;   rcx  = end-of-predictor sentinel (rdx + block_size * block_size)
;   rdi  = accumulator pointer, advanced 64 bytes per iteration
;   rax  = count pointer, advanced 32 bytes per iteration
;   xmm7 = zero, used to widen bytes -> words -> dwords
;
; Layout/alignment requirements visible in the code:
;   * frame2 is read as contiguous 16-byte groups (no stride), so the
;     predictor is a packed block_size x block_size block.
;   * movdqa is used on frame2, count and accumulator for both block
;     sizes, and on frame1 rows when block_size == 16; those addresses
;     must be 16-byte aligned. The 8x8 path reads frame1 with movq, so
;     frame1 has no alignment requirement there.
;   * block_size == 8 selects the 8x8 path; any other value takes the
;     16x16 load path on the first iteration.
;-----------------------------------------------------------------------
global sym(vp8_temporal_filter_apply_sse2) PRIVATE
sym(vp8_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; Offsets of the 16-byte local slots, relative to the aligned rsp.
    %define block_size    0
    %define strength      16
    %define filter_weight 32
    %define rounding_bit  48
    %define rbp_backup    64
    %define stack_size    80
    sub         rsp,           stack_size
    mov         [rsp + rbp_backup], rbp ; rbp is reused for stride below
    ; end prolog

        ; The scalar arguments below are 32-bit C types. They are read
        ; with 32-bit loads because the upper half of an 8-byte argument
        ; slot holding a 32-bit value is not guaranteed to be zero.
        mov         edx,            arg(3) ; block_size (zero-extends)
        mov         [rsp + block_size], rdx
        movd        xmm6,           arg(4) ; strength, upper lanes zeroed
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        mov         edx,            16
        sub         edx,            arg(4) ; 16 - strength (32-bit read)
        movq        xmm4,           rdx    ; psrlw needs the count in an xmm register
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(6) ; accumulator
        mov         rax,            arg(7) ; count

        ; dup the filter weight into all 8 words and store for later
        movd        xmm0,           arg(5) ; filter_weight
        pshuflw     xmm0,           xmm0, 0 ; low 4 words = weight
        punpcklwd   xmm0,           xmm0    ; all 8 words = weight
        movdqa      [rsp + filter_weight], xmm0

        ; rbp stops being the frame pointer here, so arg() must not be
        ; used past this point. The 32-bit load zero-extends stride.
        mov         ebp,            arg(1) ; stride
        pxor        xmm7,           xmm7   ; zero for extraction

        ; rcx = one past the last predictor byte for this block size
        lea         rcx,            [rdx + 16*16*1]
        cmp         dword ptr [rsp + block_size], 8
        jne         .temporal_filter_apply_load_16
        lea         rcx,            [rdx + 8*8*1]

.temporal_filter_apply_load_8:
        ; 8x8: two 8-pixel source rows make up one 16-pixel group
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        ; 16x16: one 16-pixel source row per iteration
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        pmullw      xmm0,           [GLOBAL(_const_3w)]
        pmullw      xmm1,           [GLOBAL(_const_3w)]

        ; modifier += 0x8000 >> (16 - strength)
        paddw       xmm0,           [rsp + rounding_bit]
        paddw       xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (xmm7 has not been written since it was last zeroed)
        pxor        xmm7,           xmm7
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 is clobbered here and re-zeroed at loop end)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; done once the predictor pointer reaches the sentinel
        cmp         rdx,            rcx
        je          .temporal_filter_apply_epilog
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword ptr [rsp + block_size], 16
        je          .temporal_filter_apply_load_16
        jmp         .temporal_filter_apply_load_8

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup] ; restore frame pointer
    add         rsp,            stack_size
    pop         rsp             ; presumably undoes ALIGN_STACK (see x86_abi_support.asm)
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
_const_3w:
    times 8 dw 3
align 16
_const_top_bit:
    times 8 dw 1<<15
align 16
_const_16w:
    times 8 dw 16