;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Register roles shared by every macro and function in this file:
;   rsi   = current source row      (read with movdqa, so it must be
;                                    16-byte aligned on every row)
;   rdi   = current reference row
;   rax   = source stride in bytes
;   rdx   = reference stride in bytes
;   xmm5  = running SAD against reference column +0
;   xmm6  = running SAD against reference column +1
;   xmm7  = running SAD against reference column +2
;   xmm0-xmm4 = scratch
; psadbw leaves one 16-bit partial sum in the low word of each 64-bit
; half, so each accumulator holds two partial sums until the store code
; folds them together.
;-----------------------------------------------------------------------

;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;   Accumulates the SAD of two 16-byte source rows against the three
;   reference windows starting at rdi, rdi+1 and rdi+2, using unaligned
;   reference loads (lddqu).
;   first != 0 : the first row initialises xmm5/xmm6/xmm7.
;   first == 0 : the first row is added to xmm5/xmm6/xmm7.
;   The second row is always added.
;   On exit rsi and rdi have advanced by two rows.
;   Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        ; step both pointers down two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, offset
;   Same contract as PROCESS_16X2X3, but the reference row is fetched
;   with two aligned 16-byte loads ([rdi] and [rdi+16]) and the three
;   windows are extracted with palignr at byte shifts offset, offset+1
;   and offset+2.
;   Requires rdi (and rdi+rdx) to be 16-byte aligned, since movdqa is
;   used; the callers subtract the misalignment from rdi beforehand.
;   first != 0 : the first row initialises xmm5/xmm6/xmm7.
;   first == 0 : the first row is added to xmm5/xmm6/xmm7.
;   On exit rsi and rdi have advanced by two rows.
;   Clobbers: xmm0-xmm4.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        ; step both pointers down two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, prefix
;   Emits the jump target <prefix>_aligned_by_<offset>: rewinds rdi by
;   offset bytes, runs eight row pairs (16 rows) through
;   PROCESS_16X2X3_OFFSET and jumps to <prefix>_store_off.
;-----------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, prefix
;   Emits the jump target <prefix>_aligned_by_<offset>: rewinds rdi by
;   offset bytes, runs four row pairs (8 rows) through
;   PROCESS_16X2X3_OFFSET and jumps to <prefix>_store_off.
;-----------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------
;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit values to results[0..2]: the sums of absolute
; differences between the 16x16 block at src_ptr and the 16x16 blocks
; at ref_ptr, ref_ptr+1 and ref_ptr+2.
;
; The low four bits of ref_ptr select one of sixteen code paths through
; a table of offsets stored relative to the do_jump label. Paths 0..14
; use aligned loads plus palignr; path 15 uses unaligned loads
; (presumably because a shift of 15+2 would need a byte beyond the two
; registers that were loaded).
;
; Saves/restores rbp, rsi, rdi, rcx. Clobbers rax, rdx, xmm0-xmm7.
; NOTE(review): xmm6/xmm7 are written but not visibly saved here; they
; are callee-saved under the Microsoft x64 ABI - confirm whether the
; ABI support macros cover this.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x3_ssse3)
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15, the index into the jump table
        mov             rdx,        0xf
        and             rdx,        rdi

        jmp vp8_sad16x16x3_ssse3_skiptable
vp8_sad16x16x3_ssse3_jumptable:
        dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
        dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
vp8_sad16x16x3_ssse3_skiptable:

        ; call/pop yields the run-time address of do_jump, so the table
        ; can hold position-independent offsets
        call vp8_sad16x16x3_ssse3_do_jump
vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = address of the selected path

        ; rax/rdx are free again: reload them with the strides
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3

        ; misalignment 15: unaligned-load path, falls through to store_off
vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;results

        ; fold the two 64-bit halves of each accumulator, store 32 bits
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit values to results[0..2]: the sums of absolute
; differences between the 16-wide, 8-row block at src_ptr and the
; blocks at ref_ptr, ref_ptr+1 and ref_ptr+2.
;
; Dispatch on (ref_ptr & 15) works exactly as in the 16x16 routine:
; paths 0..14 use aligned loads plus palignr, path 15 uses unaligned
; loads.
;
; Saves/restores rbp, rsi, rdi, rcx. Clobbers rax, rdx, xmm0-xmm7.
; NOTE(review): xmm6/xmm7 are written but not visibly saved here; they
; are callee-saved under the Microsoft x64 ABI - confirm whether the
; ABI support macros cover this.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x3_ssse3)
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15, the index into the jump table
        mov             rdx,        0xf
        and             rdx,        rdi

        jmp vp8_sad16x8x3_ssse3_skiptable
vp8_sad16x8x3_ssse3_jumptable:
        dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
        dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
vp8_sad16x8x3_ssse3_skiptable:

        ; call/pop yields the run-time address of do_jump, so the table
        ; can hold position-independent offsets
        call vp8_sad16x8x3_ssse3_do_jump
vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = address of the selected path

        ; rax/rdx are free again: reload them with the strides
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3

        ; misalignment 15: unaligned-load path, falls through to store_off
vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;results

        ; fold the two 64-bit halves of each accumulator, store 32 bits
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret