;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; void vp9_temporal_filter_apply_sse2 | arg
;  (unsigned char  *frame1,           |  0
;   unsigned int    stride,           |  1
;   unsigned char  *frame2,           |  2
;   unsigned int    block_width,      |  3
;   unsigned int    block_height,     |  4
;   int             strength,         |  5
;   int             filter_weight,    |  6
;   unsigned int   *accumulator,      |  7
;   unsigned short *count)            |  8
;
; For every pixel the routine computes, in 16-bit lanes,
;     d        = frame1 pixel - frame2 pixel
;     modifier = (3*d*d + (0x8000 >> (16 - strength))) >> strength
;     modifier = (16 - modifier), saturating at 0
;     modifier *= filter_weight
; and then updates the outputs:
;     count[i]       += modifier                  (16-bit add)
;     accumulator[i] += modifier * frame2 pixel   (32-bit add of the
;                                                  16-bit product)
;
; Each loop iteration consumes 16 contiguous bytes of frame2, 16 count
; entries and 16 accumulator entries.  frame1 is read as one 16-byte row
; (16-wide path) or as two 8-byte rows (8-wide path), advancing by
; `stride` after every row.  The loop ends when the frame2 cursor
; reaches frame2 + block_width * block_height.
;
; Alignment: frame2, count and accumulator are accessed with movdqa, as
; are the frame1 rows in the 16-wide path, so those addresses must be
; 16-byte aligned.
;
; Arguments are fetched through the arg(n) macro supplied by
; x86_abi_support.asm.  All scalar arguments are 32-bit C types and are
; read with 32-bit accesses only.
;
; Register roles inside the loop:
;     rsi  = frame1 cursor          rbp  = stride
;     rdx  = frame2 cursor          rcx  = frame2 end pointer
;     rdi  = accumulator cursor     rax  = count cursor
;     xmm7 = zero (for byte->word / word->dword unpacking)
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):

    push        rbp
    mov         rbp,            rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ALIGN_STACK 16, rax
    ; Local frame, addressed relative to rsp.  Every slot is 16 bytes so
    ; the vector slots can be used as movdqa / m128 operands.
    %define block_width    0
    %define block_height  16
    %define strength      32
    %define filter_weight 48
    %define rounding_bit  64
    %define rbp_backup    80
    %define stack_size    96
    sub         rsp,            stack_size
    mov         [rsp + rbp_backup], rbp    ; rbp is reused for stride below
    ; end prolog

        mov         edx,            arg(3)
        mov         [rsp + block_width], rdx
        mov         edx,            arg(4)
        mov         [rsp + block_height], rdx
        movd        xmm6,           arg(5)
        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

        ; calculate the rounding bit outside the loop
        ; 0x8000 >> (16 - strength)
        ; strength is an int: use 32-bit arithmetic so that only the 32
        ; bits belonging to the argument are read; the upper half of a
        ; 64-bit argument slot is not guaranteed to be zero.
        mov         edx,            16
        sub         edx,            arg(5) ; 16 - strength
        movd        xmm4,           edx    ; shift count must live in an xmm reg
        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
        psrlw       xmm5,           xmm4
        movdqa      [rsp + rounding_bit], xmm5

        mov         rsi,            arg(0) ; src/frame1
        mov         rdx,            arg(2) ; predictor frame
        mov         rdi,            arg(7) ; accumulator
        mov         rax,            arg(8) ; count

        ; dup the filter weight into all 8 words and store for later
        movd        xmm0,           arg(6) ; filter_weight
        pshuflw     xmm0,           xmm0, 0
        punpcklwd   xmm0,           xmm0
        movdqa      [rsp + filter_weight], xmm0

        ; stride is an unsigned int: the 32-bit load zero-extends it into
        ; rbp.  arg() must not be used past this point because rbp no
        ; longer holds the frame pointer.
        mov         ebp,            arg(1) ; stride

        ; rcx = frame2 + block_width * block_height (end of predictor)
        mov         rcx,            [rsp + block_width]
        imul        rcx,            [rsp + block_height]
        add         rcx,            rdx

        ; Single dispatch point, used on entry and for every further
        ; iteration: width 8 takes the two-row loader, anything else the
        ; 16-byte row loader.
.temporal_filter_apply_next:
        pxor        xmm7,           xmm7   ; zero for extraction
        cmp         dword ptr [rsp + block_width], 8
        jne         .temporal_filter_apply_load_16

.temporal_filter_apply_load_8:
        movq        xmm0,           [rsi]  ; first row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        movq        xmm1,           [rsi]  ; second row
        lea         rsi,            [rsi + rbp] ; += stride
        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
        jmp         .temporal_filter_apply_load_finished

.temporal_filter_apply_load_16:
        movdqa      xmm0,           [rsi]  ; src (frame1)
        lea         rsi,            [rsi + rbp] ; += stride
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; src[ 8-15]

.temporal_filter_apply_load_finished:
        movdqa      xmm2,           [rdx]  ; predictor (frame2)
        movdqa      xmm3,           xmm2
        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]

        ; modifier = src_byte - pixel_value
        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
        psubw       xmm1,           xmm3   ; src - pred[ 8-15]

        ; modifier *= modifier
        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2

        ; modifier *= 3
        pmullw      xmm0,           [GLOBAL(_const_3w)]
        pmullw      xmm1,           [GLOBAL(_const_3w)]

        ; modifier += 0x8000 >> (16 - strength)
        paddw       xmm0,           [rsp + rounding_bit]
        paddw       xmm1,           [rsp + rounding_bit]

        ; modifier >>= strength
        psrlw       xmm0,           [rsp + strength]
        psrlw       xmm1,           [rsp + strength]

        ; modifier = 16 - modifier
        ; saturation takes care of modifier > 16
        movdqa      xmm3,           [GLOBAL(_const_16w)]
        movdqa      xmm2,           [GLOBAL(_const_16w)]
        psubusw     xmm3,           xmm1
        psubusw     xmm2,           xmm0

        ; modifier *= filter_weight
        pmullw      xmm2,           [rsp + filter_weight]
        pmullw      xmm3,           [rsp + filter_weight]

        ; count
        movdqa      xmm4,           [rax]
        movdqa      xmm5,           [rax+16]
        ; += modifier
        paddw       xmm4,           xmm2
        paddw       xmm5,           xmm3
        ; write back
        movdqa      [rax],          xmm4
        movdqa      [rax+16],       xmm5
        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))

        ; load and extract the predictor up to shorts
        ; (xmm7 is still zero here: nothing since the top of the
        ;  iteration has written to it)
        movdqa      xmm0,           [rdx]
        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
        movdqa      xmm1,           xmm0
        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]

        ; modifier *= pixel_value
        pmullw      xmm0,           xmm2
        pmullw      xmm1,           xmm3

        ; expand to double words
        movdqa      xmm2,           xmm0
        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
        movdqa      xmm3,           xmm1
        punpcklwd   xmm1,           xmm7   ; [ 8-11]
        punpckhwd   xmm3,           xmm7   ; [12-15]

        ; accumulator (xmm7 is overwritten here; it is re-zeroed at the
        ; top of the next iteration)
        movdqa      xmm4,           [rdi]
        movdqa      xmm5,           [rdi+16]
        movdqa      xmm6,           [rdi+32]
        movdqa      xmm7,           [rdi+48]
        ; += modifier
        paddd       xmm4,           xmm0
        paddd       xmm5,           xmm2
        paddd       xmm6,           xmm1
        paddd       xmm7,           xmm3
        ; write back
        movdqa      [rdi],          xmm4
        movdqa      [rdi+16],       xmm5
        movdqa      [rdi+32],       xmm6
        movdqa      [rdi+48],       xmm7
        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))

        ; keep going until the predictor cursor hits the end pointer
        cmp         rdx,            rcx
        jne         .temporal_filter_apply_next

.temporal_filter_apply_epilog:
    ; begin epilog
    mov         rbp,            [rsp + rbp_backup] ; frame pointer back
    add         rsp,            stack_size
    pop         rsp                        ; counterpart of ALIGN_STACK (macro from
                                           ; x86_abi_support.asm; presumably reloads
                                           ; the pre-alignment rsp it saved)
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
_const_3w:                                 ; 8 x word 3
    times 8 dw 3
align 16
_const_top_bit:                            ; 8 x word 0x8000
    times 8 dw 1<<15
align 16
_const_16w:                                ; 8 x word 16
    times 8 dw 16