;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;------------------------------------------------------------------------------
; High bit-depth intra predictors (DC, V and TM) for 4x4 .. 32x32 blocks.
;
; Syntax : NASM/YASM, Intel operand order, x86inc.asm macro layer.
; Samples: 16-bit words. `stride` is counted in samples, which is why every
;          row address below is formed as strideq*2 (bytes).
; Args   : named by each cglobal line (dst, stride, above, left[, bps]).
; NOTE(review): the 4x4 variants use MMX registers and do not issue emms;
;               presumably callers reset the x87/MMX state - confirm.
;------------------------------------------------------------------------------

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Rounding terms added before the final shift of the DC average.
pw_4:  times 8 dw 4
pw_8:  times 8 dw 8
; Despite the pw_ prefix the next two are dword vectors (used with paddd).
pw_16: times 4 dd 16
pw_32: times 4 dd 32

SECTION .text

;------------------------------------------------------------------------------
; highbd_dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movq                  m0, [aboveq]
  movq                  m2, [leftq]
  DEFINE_ARGS dst, stride, one
  mov                 oned, 0x0001
  pxor                  m1, m1
  movd                  m3, oned
  pshufw                m3, m3, 0x0           ; m3 = 1 in every word
  paddw                 m0, m2                ; above[i] + left[i]
  pmaddwd               m0, m3                ; pairwise add -> 2 dwords
  packssdw              m0, m1                ; back to words, upper half 0
  pmaddwd               m0, m3                ; total in dword 0
  paddw                 m0, [GLOBAL(pw_4)]    ; rounding
  psraw                 m0, 3
  pshufw                m0, m0, 0x0           ; broadcast the DC value
  movq    [dstq          ], m0                ; row 0
  movq    [dstq+strideq*2], m0                ; row 1
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  movq    [dstq          ], m0                ; row 2
  movq    [dstq+strideq*2], m0                ; row 3

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
; Uses aligned (mova) accesses on above, left and every dst row.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, one
  mov                 oned, 0x00010001
  lea             stride3q, [strideq*3]
  movd                  m3, oned
  pshufd                m3, m3, 0x0           ; m3 = 1 in every word
  paddw                 m0, m2                ; above[i] + left[i]
  pmaddwd               m0, m3                ; 8 words -> 4 dwords
  packssdw              m0, m1
  pmaddwd               m0, m3                ; 4 -> 2
  packssdw              m0, m1
  pmaddwd               m0, m3                ; 2 -> 1, total in dword 0
  paddw                 m0, [GLOBAL(pw_8)]    ; rounding
  psrlw                 m0, 4                 ; logical: sum may use bit 15
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                ; broadcast the DC value
  mova   [dstq           ], m0                ; rows 0-3
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  mova   [dstq           ], m0                ; rows 4-7
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; highbd_dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
; The loop runs 4 times and writes 4 rows (32 bytes each) per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m3, [aboveq+16]
  mova                  m2, [leftq]
  mova                  m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4                ; 8 partial sums (words)
  movhlps               m2, m0
  paddw                 m0, m2                ; 4 partial sums
  punpcklwd             m0, m1                ; widen to dwords
  movhlps               m2, m0
  paddd                 m0, m2                ; 2 partial sums
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2                ; total in dword 0
  paddd                 m0, [GLOBAL(pw_16)]   ; rounding
  psrad                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                ; broadcast the DC value
.loop:
  mova   [dstq              ], m0
  mova   [dstq           +16], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

%if ARCH_X86_64
;------------------------------------------------------------------------------
; highbd_dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
; Needs m8, hence only built when ARCH_X86_64 is set.
; The loop runs 8 times and writes 4 rows (64 bytes each) per pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  mova                  m5, [leftq]
  mova                  m6, [leftq+16]
  mova                  m7, [leftq+32]
  mova                  m8, [leftq+48]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4
  paddw                 m0, m5
  paddw                 m0, m6
  paddw                 m0, m7
  paddw                 m0, m8                ; 8 partial sums (words)
  movhlps               m2, m0
  paddw                 m0, m2                ; 4 partial sums
  punpcklwd             m0, m1                ; widen to dwords
  movhlps               m2, m0
  paddd                 m0, m2                ; 2 partial sums
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2                ; total in dword 0
  paddd                 m0, [GLOBAL(pw_32)]   ; rounding
  psrad                 m0, 6
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0                ; broadcast the DC value
.loop:
  mova [dstq               ], m0
  mova [dstq          +16  ], m0
  mova [dstq          +32  ], m0
  mova [dstq          +48  ], m0
  mova [dstq+strideq*2     ], m0
  mova [dstq+strideq*2+16  ], m0
  mova [dstq+strideq*2+32  ], m0
  mova [dstq+strideq*2+48  ], m0
  mova [dstq+strideq*4     ], m0
  mova [dstq+strideq*4+16  ], m0
  mova [dstq+strideq*4+32  ], m0
  mova [dstq+strideq*4+48  ], m0
  mova [dstq+stride3q*2    ], m0
  mova [dstq+stride3q*2 +16], m0
  mova [dstq+stride3q*2 +32], m0
  mova [dstq+stride3q*2 +48], m0
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET
%endif

;------------------------------------------------------------------------------
; highbd_v_predictor_4x4(dst, stride, above)
; Copies above[0..3] into each of the 4 rows of dst.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  movq    [dstq          ], m0                ; row 0
  movq    [dstq+strideq*2], m0                ; row 1
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  movq    [dstq          ], m0                ; row 2
  movq    [dstq+strideq*2], m0                ; row 3
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_8x8(dst, stride, above)
; Copies above[0..7] into each of the 8 rows of dst (aligned accesses).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  mova   [dstq           ], m0                ; rows 0-3
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  mova   [dstq           ], m0                ; rows 4-7
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  RET

;------------------------------------------------------------------------------
; highbd_v_predictor_16x16(dst, stride, above)
; Copies above[0..15] into each of the 16 rows of dst, 4 rows per loop pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov             nlines4d, 4
.loop:
  mova    [dstq              ], m0
  mova    [dstq           +16], m1
  mova    [dstq+strideq*2    ], m0
  mova    [dstq+strideq*2 +16], m1
  mova    [dstq+strideq*4    ], m0
  mova    [dstq+strideq*4 +16], m1
  mova    [dstq+stride3q*2   ], m0
  mova    [dstq+stride3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec             nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_v_predictor_32x32(dst, stride, above)
; Copies above[0..31] into each of the 32 rows of dst, 4 rows per loop pass.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  mova                  m2, [aboveq+32]
  mova                  m3, [aboveq+48]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov             nlines4d, 8
.loop:
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq            +32], m2
  mova [dstq            +48], m3
  mova [dstq+strideq*2     ], m0
  mova [dstq+strideq*2  +16], m1
  mova [dstq+strideq*2  +32], m2
  mova [dstq+strideq*2  +48], m3
  mova [dstq+strideq*4     ], m0
  mova [dstq+strideq*4  +16], m1
  mova [dstq+strideq*4  +32], m2
  mova [dstq+strideq*4  +48], m3
  mova [dstq+stride3q*2    ], m0
  mova [dstq+stride3q*2 +16], m1
  mova [dstq+stride3q*2 +32], m2
  mova [dstq+stride3q*2 +48], m3
  lea                 dstq, [dstq+strideq*8]  ; advance four rows
  dec             nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_4x4(dst, stride, above, left, bps)
; dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop pass. lineq counts -2 .. 0 and indexes left
; from its end, so each pass reads exactly left[2i] and left[2i+1].
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2]        ; word 0 = above[-1] (top-left)
  movq                  m0, [aboveq]
  pshufw                m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  movd                  m3, oned
  movd                  m4, bpsd
  pshufw                m3, m3, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov                lineq, -2
  mova                  m2, m3
  psllw                 m3, m4
  add                leftq, 8                 ; leftq -> one past left[3]
  psubw                 m3, m2                ; max possible value
  pxor                  m4, m4                ; min possible value
  psubw                 m0, m1                ; above[c] - top-left
.loop:
  ; One 4-byte load supplies both rows' left samples, so nothing beyond the
  ; end of the left array is touched.
  movd                  m1, [leftq+lineq*4]
  pshufw                m2, m1, 0x55          ; broadcast left[2i+1]
  pshufw                m1, m1, 0x0           ; broadcast left[2i]
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m1
  movq    [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_8x8(dst, stride, above, left, bps)
; dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows per loop pass; lineq counts -4 .. 0.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2]        ; word 0 = above[-1] (top-left)
  mova                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m3, m3
  pxor                  m4, m4
  pinsrw                m3, oned, 0
  pinsrw                m4, bpsd, 0
  pshuflw               m3, m3, 0x0
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m3, m3
  mov                lineq, -4
  mova                  m2, m3
  punpcklqdq            m1, m1
  psllw                 m3, m4
  add                leftq, 16                ; leftq -> one past left[7]
  psubw                 m3, m2                ; max possible value
  pxor                  m4, m4                ; min possible value
  psubw                 m0, m1                ; above[c] - top-left
.loop:
  ; Single load of left[2i], left[2i+1]; avoids reading past the array end.
  movd                  m1, [leftq+lineq*4]
  pshuflw               m2, m1, 0x55          ; left[2i+1] in low 4 words
  pshuflw               m1, m1, 0x0           ; left[2i]   in low 4 words
  punpcklqdq            m1, m1
  punpcklqdq            m2, m2
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  mova    [dstq          ], m1
  mova    [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

%if ARCH_X86_64
;------------------------------------------------------------------------------
; highbd_tm_predictor_16x16(dst, stride, above, left, bps)
; dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows per loop pass; lineq counts -8 .. 0. Uses m8 (x86-64 only).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
  movd                  m2, [aboveq-2]        ; word 0 = above[-1] (top-left)
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  pshuflw               m2, m2, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m7, m7
  pxor                  m8, m8
  pinsrw                m7, oned, 0
  pinsrw                m8, bpsd, 0
  pshuflw               m7, m7, 0x0
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m7, m7
  mov                lineq, -8
  mova                  m5, m7
  punpcklqdq            m2, m2
  psllw                 m7, m8
  add                leftq, 32                ; leftq -> one past left[15]
  psubw                 m7, m5                ; max possible value
  pxor                  m8, m8                ; min possible value
  psubw                 m0, m2                ; above[0..7]  - top-left
  psubw                 m1, m2                ; above[8..15] - top-left
.loop:
  ; Single load of left[2i], left[2i+1]; avoids reading past the array end.
  movd                  m2, [leftq+lineq*4]
  pshuflw               m3, m2, 0x55          ; left[2i+1] in low 4 words
  pshuflw               m2, m2, 0x0           ; left[2i]   in low 4 words
  punpcklqdq            m2, m2
  punpcklqdq            m3, m3
  paddw                 m4, m2, m0
  paddw                 m5, m3, m0
  paddw                 m2, m1
  paddw                 m3, m1
  ;Clamp to the bit-depth
  pminsw                m4, m7
  pminsw                m5, m7
  pminsw                m2, m7
  pminsw                m3, m7
  pmaxsw                m4, m8
  pmaxsw                m5, m8
  pmaxsw                m2, m8
  pmaxsw                m3, m8
  ;Store the values
  mova   [dstq             ], m4
  mova   [dstq+strideq*2   ], m5
  mova   [dstq          +16], m2
  mova   [dstq+strideq*2+16], m3
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; highbd_tm_predictor_32x32(dst, stride, above, left, bps)
; dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows per loop pass; lineq counts -16 .. 0. Uses m8-m11 (x86-64 only).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
  movd                  m0, [aboveq-2]        ; word 0 = above[-1] (top-left)
  mova                  m1, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  pshuflw               m0, m0, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                 m10, m10
  pxor                 m11, m11
  pinsrw               m10, oned, 0
  pinsrw               m11, bpsd, 0
  pshuflw              m10, m10, 0x0
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq           m10, m10
  mov                lineq, -16
  mova                  m5, m10
  punpcklqdq            m0, m0
  psllw                m10, m11
  add                leftq, 64                ; leftq -> one past left[31]
  psubw                m10, m5                ; max possible value
  pxor                 m11, m11               ; min possible value
  psubw                 m1, m0                ; above[c] - top-left, 4 vectors
  psubw                 m2, m0
  psubw                 m3, m0
  psubw                 m4, m0
.loop:
  ; Single load of left[2i], left[2i+1]; avoids reading past the array end.
  movd                  m5, [leftq+lineq*4]
  pshuflw               m6, m5, 0x55          ; left[2i+1] in low 4 words
  pshuflw               m5, m5, 0x0           ; left[2i]   in low 4 words
  punpcklqdq            m5, m5
  punpcklqdq            m6, m6
  paddw                 m7, m5, m1
  paddw                 m8, m5, m2
  paddw                 m9, m5, m3
  paddw                 m5, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m5, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m5, m11
  ;Store these values
  mova   [dstq           ], m7
  mova   [dstq        +16], m8
  mova   [dstq        +32], m9
  mova   [dstq        +48], m5
  paddw                 m7, m6, m1
  paddw                 m8, m6, m2
  paddw                 m9, m6, m3
  paddw                 m6, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m6, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m6, m11
  ;Store these values
  mova   [dstq+strideq*2   ], m7
  mova   [dstq+strideq*2+16], m8
  mova   [dstq+strideq*2+32], m9
  mova   [dstq+strideq*2+48], m6
  lea                 dstq, [dstq+strideq*4]  ; advance two rows
  inc                lineq
  jnz .loop
  REP_RET
%endif