/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// NOTE: the byte-shuffle tables below are pshufb control masks.  A value of
// 128 has bit 7 set, which makes pshufb write a zero to that destination
// byte; other values are source byte indices within the xmm register.

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the final >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3 (16.16 reciprocals used with
// pmulhuw, i.e. divide by 9 or 6).
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2 (divide by 3 or 2 via pmulhuw).
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    // Naked function: arguments read directly from the stack.
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8                // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16                // 16 output pixels per iteration.
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    pcmpeqb    xmm5, xmm5             // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]

    movdqa     xmm2, xmm0             // average columns (32 to 16 pixels)
    psrlw      xmm0, 8                // odd bytes of each word
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5             // even bytes of each word
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2             // rounded average of the pair
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi                    // callee-saved; used for src_stride.
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    pcmpeqb    xmm5, xmm5             // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]      // second row
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0             // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 32 pixels, throws half away and writes 16 pixels.
// No alignment requirement: uses unaligned loads and stores (movdqu).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8                // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// No alignment requirement: uses unaligned loads and stores (movdqu).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    pcmpeqb    xmm5, xmm5             // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]

    movdqa     xmm2, xmm0             // average columns (32 to 16 pixels)
    psrlw      xmm0, 8                // odd bytes
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5             // even bytes
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// No alignment requirement: uses unaligned loads and stores (movdqu).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    pcmpeqb    xmm5, xmm5             // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]      // second row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0             // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    pcmpeqb    xmm5, xmm5             // generate mask 0x00ff0000
    psrld      xmm5, 24               // per dword: 0x000000ff
    pslld      xmm5, 16               // per dword: 0x00ff0000

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5             // keep byte 2 of each dword
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    sub        ecx, 8                 // 8 output pixels per iteration.
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]     // src_ptr
    mov        esi, [esp + 8 + 8]     // src_stride
    mov        edx, [esp + 8 + 12]    // dst_ptr
    mov        ecx, [esp + 8 + 16]    // dst_width
    lea        edi, [esi + esi * 2]   // src_stride * 3
    pcmpeqb    xmm7, xmm7             // generate mask 0x00ff00ff
    psrlw      xmm7, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [eax + esi * 2]
    movdqa     xmm3, [eax + esi * 2 + 16]
    movdqa     xmm4, [eax + edi]
    movdqa     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2             // average of the 4 row-averages
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0             // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0             // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8          // bytes 8..23 of the 32 byte window
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24                // 24 output pixels per iteration.
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]            // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm0, xmm1             // blend the two rows 1:1
    pshufb     xmm0, xmm2             // pair up neighbors for pmaddubsw
    pmaddubsw  xmm0, xmm5             // weighted horizontal sum
    paddsw     xmm0, xmm7             // + rounding
    psrlw      xmm0, 2                // / 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]        // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]       // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21          // no free register; load per pass.
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]            // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm1, xmm0             // double pavgb gives a 3:1 row blend
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7             // + rounding
    psrlw      xmm0, 2                // / 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]        // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]       // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21          // no free register; load per pass.
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx+24]
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

    align      4
  xloop:
    movdqa     xmm0, [eax]            // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [eax + 16]       // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4             // low 6 output bytes
    pshufb     xmm1, xmm5             // high 6 output bytes (disjoint lanes)
    paddusb    xmm0, xmm1             // merge; lanes don't overlap.

    sub        ecx, 12
    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5             // constant zero for unpacking.

    align      4
  xloop:
    movdqa     xmm0, [eax]            // sum up 3 rows into xmm0/1
    movdqa     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5             // widen bytes to words
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqa     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0             // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2                // horizontal sum of triples
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1             // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4             // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    sub        ecx, 6
    movd       [edx], xmm6            // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6        // overlapping 4 byte store for bytes 2..5
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

    align      4
  xloop:
    movdqa     xmm0, [eax]            // average 2 rows into xmm0
    pavgb      xmm0, [eax + esi]
    lea        eax, [eax + 16]

    movdqa     xmm1, xmm0             // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2             // first byte of each triple, as words
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3             // second byte of each triple
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4             // third byte (where present)
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5             // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    sub        ecx, 6
    movd       [edx], xmm1            // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1        // overlapping 4 byte store for bytes 2..5
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        esi, [esp + 16 + 4]    // src_ptr
    mov        edx, [esp + 16 + 8]    // src_stride
    mov        edi, [esp + 16 + 12]   // dst_ptr
    mov        ecx, [esp + 16 + 16]   // src_width
    mov        ebx, [esp + 16 + 20]   // src_height
    pxor       xmm4, xmm4             // constant zero for unpacking.
    dec        ebx                    // rows after the first.

    align      4
  xloop:
    // first row
    movdqa     xmm0, [esi]
    lea        eax, [esi + edx]       // row pointer for the inner loop.
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm4             // widen 16 bytes to 16 words
    punpckhbw  xmm1, xmm4
    lea        esi, [esi + 16]
    mov        ebp, ebx
    test       ebp, ebp
    je         ydone                  // single row: nothing to add.

    // sum remaining rows
    align      4
  yloop:
    movdqa     xmm2, [eax]            // read 16 pixels
    lea        eax, [eax + edx]       // advance to next row
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    paddusw    xmm0, xmm2             // sum 16 words
    paddusw    xmm1, xmm3
    sub        ebp, 1
    jg         yloop

    align      4
  ydone:
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm1
    lea        edi, [edi + 32]

    sub        ecx, 16
    jg         xloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//    xor        ebx, ebx
//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x (16.16 fixed point)
    movd       xmm3, [esp + 12 + 20]  // dx (16.16 fixed point)
    mov        eax, 0x04040000        // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6             // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1           // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29                // fewer than 2 pixels remain.

    movdqa     xmm0, xmm2             // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0             // x0 x1
    punpckldq  xmm3, xmm3             // dx dx
    paddd      xmm3, xmm3             // dx * 2, dx * 2
    pextrw     edx, xmm2, 3           // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2             // x0, x1 fractions.
    paddd      xmm2, xmm3             // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9                // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5             // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6             // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1             // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    psrlw      xmm0, 7                // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0             // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2                 // 2 pixels
    jge        xloop2

    align      4
  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9                // 7 bit fractions.
    pshufb     xmm2, xmm5             // 0011
    pxor       xmm2, xmm6             // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2             // 16 bit
    psrlw      xmm0, 7                // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0             // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

    align      4
  xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]         // dst_ptr
    mov        eax, [esp + 8]         // src_ptr
    mov        ecx, [esp + 12]        // dst_width
                                      // x and dx ignored (fixed 2x upsample).

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0             // duplicate each byte
    punpckhbw  xmm1, xmm1
    sub        ecx, 32
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_argb
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_argb
    mov        ecx, [esp + 16]        // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    shufps     xmm0, xmm1, 0xdd       // odd ARGB dwords
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_argb
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_argb
    mov        ecx, [esp + 16]        // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2             // average horizontal pairs
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_argb
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_argb
    mov        ecx, [esp + 4 + 16]    // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]      // second row
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]     // src_argb
                                      // src_stride ignored
    mov        ebx, [esp + 8 + 12]    // src_stepx
    mov        edx, [esp + 8 + 16]    // dst_argb
    mov        ecx, [esp + 8 + 20]    // dst_width
    lea        ebx, [ebx * 4]         // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]   // 3 * step

    align      4
  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax, [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2             // gather 4 pixels into one register
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]         // step in bytes
    lea        edi, [ebx + ebx * 2]   // 3 * step

    align      4
  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax, [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi, [esi + ebx * 4]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]     // dst_argb
    mov        esi, [esp + 8 + 8]     // src_argb
    mov        ecx, [esp + 8 + 12]    // dst_width
    movd       xmm2, [esp + 8 + 16]   // x (16.16 fixed point)
    movd       xmm3, [esp + 8 + 20]   // dx (16.16 fixed point)

    // Build {x0, x1, x2, x3} and a dx*4 step vector.
    pshufd     xmm2, xmm2, 0          // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11       // dx 0 dx 0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3             // 0, 0, 0, dx * 2
    pshufd     xmm0, xmm3, 0x05       // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0             // x3 x2 x1 x0
    paddd      xmm3, xmm3             // 0, 0, 0, dx * 4
    pshufd     xmm3, xmm3, 0          // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1           // get x0 integer.
    pextrw     edx, xmm2, 3           // get x1 integer.

    cmp        ecx, 0
    jle        xloop99                // nothing to do for dst_width <= 0.
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
    align      4
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    sub        ecx, 4                 // 4 pixels
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    jge        xloop4

    align      4
  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer for the 1 pixel tail.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
    align      4
  xloop99:

    pop        esi
    pop        edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]     // dst_argb
    mov        esi, [esp + 8 + 8]     // src_argb
    mov        ecx, [esp + 8 + 12]    // dst_width
    movd       xmm2, [esp + 8 + 16]   // x (16.16 fixed point)
    movd       xmm3, [esp + 8 + 20]   // dx (16.16 fixed point)
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6             // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1           // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29                // fewer than 2 pixels remain.

    movdqa     xmm0, xmm2             // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0             // x0 x1
    punpckldq  xmm3, xmm3             // dx dx
    paddd      xmm3, xmm3             // dx * 2, dx * 2
    pextrw     edx, xmm2, 3           // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2             // x0, x1 fractions.
    paddd      xmm2, xmm3             // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9                // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5             // 0000000011111111
    pshufb     xmm0, xmm4             // arrange pixels into pairs
    pxor       xmm1, xmm6             // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1             // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    psrlw      xmm0, 7                // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0             // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2                 // 2 pixels
    jge        xloop2

    align      4
  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9                // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5             // 00000000
    pshufb     xmm0, xmm4             // arrange pixels into pairs
    pxor       xmm2, xmm6             // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2             // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0             // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

    align      4
  xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]         // dst_argb
    mov        eax, [esp + 8]         // src_argb
    mov        ecx, [esp + 12]        // dst_width
                                      // x and dx ignored (fixed 2x upsample).

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0             // duplicate each ARGB dword
    punpckhdq  xmm1, xmm1
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]         // num
    cdq                               // extend num to 64 bits
    shld       edx, eax, 16           // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]    // edx:eax / div -> eax
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Computes ((num << 16) - 0x00010001) / (div - 1); used for inclusive
// endpoint interpolation.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]         // num
    mov        ecx, [esp + 8]         // denom
    cdq                               // extend num to 64 bits
    shld       edx, eax, 16           // 32.16
    shl        eax, 16
    sub        eax, 0x00010001        // 64 bit subtract of 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif