/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

// The 64 bit section below uses intrinsics (no __asm blocks), so pull in the
// SSE2/SSSE3 intrinsic headers for that build only.
#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// Read 4 UV from 422, upsample to 8 UV.
// 'offset' is the byte distance from u_buf to v_buf, so both chroma planes
// are read through u_buf.  4 U and 4 V bytes are interleaved into UV pairs,
// then each pair is doubled (422 -> 444) in xmm0.  8 Y bytes are duplicated
// into both halves of each 16 bit lane of xmm4 for the fixed point multiply
// performed later by YUVTORGB.
// NOTE(review): no inline comments inside the macro body; a // comment would
// swallow the line-continuation backslash.
#define READYUV422 \
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
    u_buf += 4; \
    xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
    y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
// Identical to READYUV422 plus a load of 8 alpha bytes into xmm5 from a_buf.
#define READYUVA422 \
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
    u_buf += 4; \
    xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
    y_buf += 8; \
    xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
    a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
55#define YUVTORGB(yuvconstants) \ 56 xmm1 = _mm_loadu_si128(&xmm0); \ 57 xmm2 = _mm_loadu_si128(&xmm0); \ 58 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ 59 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ 60 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ 61 xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ 62 xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ 63 xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ 64 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ 65 xmm0 = _mm_adds_epi16(xmm0, xmm4); \ 66 xmm1 = _mm_adds_epi16(xmm1, xmm4); \ 67 xmm2 = _mm_adds_epi16(xmm2, xmm4); \ 68 xmm0 = _mm_srai_epi16(xmm0, 6); \ 69 xmm1 = _mm_srai_epi16(xmm1, 6); \ 70 xmm2 = _mm_srai_epi16(xmm2, 6); \ 71 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ 72 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ 73 xmm2 = _mm_packus_epi16(xmm2, xmm2); 74 75// Store 8 ARGB values. 76#define STOREARGB \ 77 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ 78 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ 79 xmm1 = _mm_loadu_si128(&xmm0); \ 80 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ 81 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ 82 _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ 83 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ 84 dst_argb += 32; 85 86 87#if defined(HAS_I422TOARGBROW_SSSE3) 88void I422ToARGBRow_SSSE3(const uint8* y_buf, 89 const uint8* u_buf, 90 const uint8* v_buf, 91 uint8* dst_argb, 92 const struct YuvConstants* yuvconstants, 93 int width) { 94 __m128i xmm0, xmm1, xmm2, xmm4; 95 const __m128i xmm5 = _mm_set1_epi8(-1); 96 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; 97 while (width > 0) { 98 READYUV422 99 YUVTORGB(yuvconstants) 100 STOREARGB 101 width -= 8; 102 } 103} 104#endif 105 106#if defined(HAS_I422ALPHATOARGBROW_SSSE3) 107void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, 108 const uint8* u_buf, 109 const uint8* v_buf, 110 const uint8* a_buf, 111 uint8* dst_argb, 112 
const struct YuvConstants* yuvconstants, 113 int width) { 114 __m128i xmm0, xmm1, xmm2, xmm4, xmm5; 115 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; 116 while (width > 0) { 117 READYUVA422 118 YUVTORGB(yuvconstants) 119 STOREARGB 120 width -= 8; 121 } 122} 123#endif 124 125// 32 bit 126#else // defined(_M_X64) 127#ifdef HAS_ARGBTOYROW_SSSE3 128 129// Constants for ARGB. 130static const vec8 kARGBToY = { 131 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 132}; 133 134// JPeg full range. 135static const vec8 kARGBToYJ = { 136 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 137}; 138 139static const vec8 kARGBToU = { 140 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 141}; 142 143static const vec8 kARGBToUJ = { 144 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 145}; 146 147static const vec8 kARGBToV = { 148 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 149}; 150 151static const vec8 kARGBToVJ = { 152 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 153}; 154 155// vpshufb for vphaddw + vpackuswb packed to shorts. 156static const lvec8 kShufARGBToUV_AVX = { 157 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 158 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 159}; 160 161// Constants for BGRA. 162static const vec8 kBGRAToY = { 163 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 164}; 165 166static const vec8 kBGRAToU = { 167 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 168}; 169 170static const vec8 kBGRAToV = { 171 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 172}; 173 174// Constants for ABGR. 
// Constants for ABGR.  8 bit coefficients per channel, replicated per pixel
// for use with pmaddubsw.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Luma bias for limited range (BT.601 studio swing) Y output.
static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// Chroma bias to re-center signed U/V values at 128.
static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
// (Index 128 in these pshufb tables writes a zero byte.)
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
// J400 (gray) to ARGB: duplicates each gray byte into B, G and R and sets
// alpha to 0xff.  8 pixels per iteration.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]    // src_y
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // width
    pcmpeqb    xmm5, xmm5        // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0        // YY -> each gray byte doubled
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0        // YYYY low 4 pixels
    punpckhwd  xmm1, xmm1        // YYYY high 4 pixels
    por        xmm0, xmm5        // force alpha byte to 0xff
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.  16 pixels per
// iteration; vpermq mutates lane order so the in-lane unpacks produce
// sequential pixels.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
  __asm {
    mov         eax, [esp + 4]     // src_y
    mov         edx, [esp + 8]     // dst_argb
    mov         ecx, [esp + 12]    // width
    vpcmpeqb    ymm5, ymm5, ymm5   // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]
    lea         eax, [eax + 16]
    vpermq      ymm0, ymm0, 0xd8
    vpunpcklbw  ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0
    vpunpcklwd  ymm0, ymm0, ymm0
    vpor        ymm0, ymm0, ymm5
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

// RGB24 (3 bytes/pixel) to ARGB: reads 48 bytes (16 pixels), realigns each
// 12-byte pixel group with palignr, expands via pshufb, and ORs in opaque
// alpha.  Stores are interleaved with computation.
__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_rgb24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// RAW (BGR byte order, 3 bytes/pixel) to ARGB.  Identical structure to
// RGB24ToARGBRow_SSSE3; only the pshufb table differs (swaps R and B).
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int width) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// RAW to RGB24: three overlapping loads (offsets 0/4/8) feed three shuffle
// tables; each produces 8 output bytes.  8 pixels (24 bytes) per iteration.
__declspec(naked)
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_rgb24
    mov       ecx, [esp + 12]  // width
    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 4]
    movdqu    xmm2, [eax + 8]
    lea       eax, [eax + 24]
    pshufb    xmm0, xmm3
    pshufb    xmm1, xmm4
    pshufb    xmm2, xmm5
    movq      qword ptr [edx], xmm0
    movq      qword ptr [edx + 8], xmm1
    movq      qword ptr [edx + 16], xmm2
    lea       edx, [edx + 24]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax         // edx = dst - 2*src so [eax*2+edx] is dst

  convertloop:
    movdqu    xmm0, [eax]      // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3       // R in upper 5 bits
    psllw     xmm2, 11         // B in upper 5 bits
    pmulhuw   xmm1, xmm5       // * (256 + 8)
    pmulhuw   xmm2, xmm5       // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2       // RB
    pand      xmm0, xmm4       // G in middle 6 bits
    pmulhuw   xmm0, xmm6       // << 5 * (256 + 4)
    por       xmm0, xmm7       // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int width) {
  __asm {
    mov        eax, 0x01080108    // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x20802080    // multiplier shift by 5 and then repeat 6 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3   // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0x07e007e0 for Green
    vpsllw     ymm4, ymm4, 10
    vpsrlw     ymm4, ymm4, 5
    vpcmpeqb   ymm7, ymm7, ymm7   // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]     // src_rgb565
    mov        edx, [esp + 8]     // dst_argb
    mov        ecx, [esp + 12]    // width
    sub        edx, eax
    sub        edx, eax           // edx = dst - 2*src so [eax*2+edx] is dst

  convertloop:
    vmovdqu    ymm0, [eax]        // fetch 16 pixels of bgr565
    vpand      ymm1, ymm0, ymm3   // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11     // B in upper 5 bits
    vpmulhuw   ymm1, ymm1, ymm5   // * (256 + 8)
    vpmulhuw   ymm2, ymm2, ymm5   // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2   // RB
    vpand      ymm0, ymm0, ymm4   // G in middle 6 bits
    vpmulhuw   ymm0, ymm0, ymm6   // << 5 * (256 + 4)
    vpor       ymm0, ymm0, ymm7   // AG
    vpermq     ymm0, ymm0, 0xd8   // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1       // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
// ARGB1555 to ARGB, 16 pixels per iteration.  Same bit-replication-by-pmul
// scheme as RGB565ToARGBRow_AVX2, plus sign extension of bit 15 for alpha.
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov        eax, 0x01080108    // generate multiplier to repeat 5 bits
    vmovd      xmm5, eax
    vbroadcastss ymm5, xmm5
    mov        eax, 0x42004200    // multiplier shift by 6 and then repeat 5 bits
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb   ymm3, ymm3, ymm3   // generate mask 0xf800f800 for Red
    vpsllw     ymm3, ymm3, 11
    vpsrlw     ymm4, ymm3, 6      // generate mask 0x03e003e0 for Green
    vpcmpeqb   ymm7, ymm7, ymm7   // generate mask 0xff00ff00 for Alpha
    vpsllw     ymm7, ymm7, 8

    mov        eax, [esp + 4]     // src_argb1555
    mov        edx, [esp + 8]     // dst_argb
    mov        ecx, [esp + 12]    // width
    sub        edx, eax
    sub        edx, eax           // edx = dst - 2*src so [eax*2+edx] is dst

  convertloop:
    vmovdqu    ymm0, [eax]        // fetch 16 pixels of 1555
    vpsllw     ymm1, ymm0, 1      // R in upper 5 bits
    vpsllw     ymm2, ymm0, 11     // B in upper 5 bits
    vpand      ymm1, ymm1, ymm3
    vpmulhuw   ymm2, ymm2, ymm5   // * (256 + 8)
    vpmulhuw   ymm1, ymm1, ymm5   // * (256 + 8)
    vpsllw     ymm1, ymm1, 8
    vpor       ymm1, ymm1, ymm2   // RB
    vpsraw     ymm2, ymm0, 8      // A
    vpand      ymm0, ymm0, ymm4   // G in middle 5 bits
    vpmulhuw   ymm0, ymm0, ymm6   // << 6 * (256 + 8)
    vpand      ymm2, ymm2, ymm7
    vpor       ymm0, ymm0, ymm2   // AG
    vpermq     ymm0, ymm0, 0xd8   // mutate for unpack
    vpermq     ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu    [eax * 2 + edx], ymm1       // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
// ARGB4444 to ARGB: each nibble is duplicated into a full byte.
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov        eax, 0x0f0f0f0f    // generate mask 0x0f0f0f0f
    vmovd      xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld     ymm5, ymm4, 4      // 0xf0f0f0f0 for high nibbles
    mov        eax, [esp + 4]     // src_argb4444
    mov        edx, [esp + 8]     // dst_argb
    mov        ecx, [esp + 12]    // width
    sub        edx, eax
    sub        edx, eax           // edx = dst - 2*src so [eax*2+edx] is dst

  convertloop:
    vmovdqu    ymm0, [eax]        // fetch 16 pixels of bgra4444
    vpand      ymm2, ymm0, ymm5   // mask high nibbles
    vpand      ymm0, ymm0, ymm4   // mask low nibbles
    vpsrlw     ymm3, ymm2, 4
    vpsllw     ymm1, ymm0, 4
    vpor       ymm2, ymm2, ymm3
    vpor       ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8   // mutate for unpack
    vpermq     ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu    [eax * 2 + edx], ymm0       // store 8 pixels of ARGB
    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea        eax, [eax + 32]
    sub        ecx, 16
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax         // edx = dst - 2*src so [eax*2+edx] is dst

  convertloop:
    movdqu    xmm0, [eax]      // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1          // R in upper 5 bits
    psllw     xmm2, 11         // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5       // * (256 + 8)
    pmulhuw   xmm1, xmm5       // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2       // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4       // G in middle 5 bits
    psraw     xmm2, 8          // A
    pmulhuw   xmm0, xmm6       // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2       // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]   // src_argb4444
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax
    sub       edx, eax         // edx = dst - 2*src so [eax*2+edx] is dst

  convertloop:
    movdqu    xmm0, [eax]      // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4       // mask low nibbles
    pand      xmm2, xmm5       // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// ARGB to RGB24: pshufb packs each 16 bytes to 12, then shifts/ORs stitch
// the four 12-byte fragments into three 16-byte stores.  16 pixels per
// iteration.
__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu    xmm0, [eax]      // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1       // 4 bytes from 1 for 0
    psrldq    xmm1, 4          // 8 bytes from 1
    pslldq    xmm4, 12         // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2       // 8 bytes from 2 for 1
    por       xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq    xmm5, 8          // 8 bytes from 2 for 1
    movdqu    [edx], xmm0      // store 0
    por       xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq    xmm2, 8          // 4 bytes from 2
    pslldq    xmm3, 4          // 12 bytes from 3 for 2
    por       xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1 // store 1
    movdqu    [edx + 32], xmm2 // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// ARGB to RAW (BGR order).  Identical structure to ARGBToRGB24Row_SSSE3;
// only the pshufb table differs.
__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu    xmm0, [eax]      // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1       // 4 bytes from 1 for 0
    psrldq    xmm1, 4          // 8 bytes from 1
    pslldq    xmm4, 12         // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2       // 8 bytes from 2 for 1
    por       xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq    xmm5, 8          // 8 bytes from 2 for 1
    movdqu    [edx], xmm0      // store 0
    por       xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq    xmm2, 8          // 4 bytes from 2
    pslldq    xmm3, 4          // 12 bytes from 3 for 2
    por       xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1 // store 1
    movdqu    [edx + 32], xmm2 // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// ARGB to RGB565: isolate B/G/R fields per 32 bit pixel, OR together and
// pack 32 bit results to 16 bit.  4 pixels per iteration.
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

  convertloop:
    movdqu    xmm0, [eax]      // fetch 4 pixels of argb
    movdqa    xmm1, xmm0       // B
    movdqa    xmm2, xmm0       // G
    pslld     xmm0, 8          // R
    psrld     xmm1, 3          // B
    psrld     xmm2, 5          // G
    psrad     xmm0, 16         // R
    pand      xmm1, xmm3       // B
    pand      xmm2, xmm4       // G
    pand      xmm0, xmm5       // R
    por       xmm1, xmm2       // BG
    por       xmm0, xmm1       // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// Same as ARGBToRGB565Row_SSE2 but adds a caller supplied 4-byte dither
// pattern (replicated so every byte lane of a pixel gets the same value)
// before truncation.
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {

    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    movd      xmm6, [esp + 12] // dither4
    mov       ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6       // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

  convertloop:
    movdqu    xmm0, [eax]      // fetch 4 pixels of argb
    paddusb   xmm0, xmm6       // add dither
    movdqa    xmm1, xmm0       // B
    movdqa    xmm2, xmm0       // G
    pslld     xmm0, 8          // R
    psrld     xmm1, 3          // B
    psrld     xmm2, 5          // G
    psrad     xmm0, 16         // R
    pand      xmm1, xmm3       // B
    pand      xmm2, xmm4       // G
    pand      xmm0, xmm5       // R
    por       xmm1, xmm2       // BG
    por       xmm0, xmm1       // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 variant of the dithered conversion, 8 pixels per iteration.
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov        ecx, [esp + 16]     // width
    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
    vpermq     ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

  convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpaddusb   ymm0, ymm0, ymm6    // add dither
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
// ARGB to ARGB1555: isolate A/R/G/B fields per 32 bit pixel and pack.
// 4 pixels per iteration.
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

  convertloop:
    movdqu    xmm0, [eax]      // fetch 4 pixels of argb
    movdqa    xmm1, xmm0       // B
    movdqa    xmm2, xmm0       // G
    movdqa    xmm3, xmm0       // R
    psrad     xmm0, 16         // A
    psrld     xmm1, 3          // B
    psrld     xmm2, 6          // G
    psrld     xmm3, 9          // R
    pand      xmm0, xmm7       // A
    pand      xmm1, xmm4       // B
    pand      xmm2, xmm5       // G
    pand      xmm3, xmm6       // R
    por       xmm0, xmm1       // BA
    por       xmm2, xmm3       // GR
    por       xmm0, xmm2       // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// ARGB to ARGB4444: keep the high nibble of each byte, merge pairs and
// pack.  4 pixels per iteration.
__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

  convertloop:
    movdqu    xmm0, [eax]      // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3       // low nibble
    pand      xmm1, xmm4       // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
// AVX2 variant of ARGBToRGB565Row, 8 pixels per iteration.
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // width
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

  convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
// AVX2 variant of ARGBToARGB1555Row, 8 pixels per iteration.
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // width
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

  convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9       // R
    vpsrld     ymm2, ymm0, 6       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrad     ymm0, ymm0, 16      // A
    vpand      ymm3, ymm3, ymm6    // R
    vpand      ymm2, ymm2, ymm5    // G
    vpand      ymm1, ymm1, ymm4    // B
    vpand      ymm0, ymm0, ymm7    // A
    vpor       ymm0, ymm0, ymm1    // BA
    vpor       ymm2, ymm2, ymm3    // GR
    vpor       ymm0, ymm0, ymm2    // BGRA
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
// AVX2 variant of ARGBToARGB4444Row, 8 pixels per iteration.
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // width
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8       // generate mask 0x00f000f0

  convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpand      ymm1, ymm0, ymm4    // high nibble
    vpand      ymm0, ymm0, ymm3    // low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1132__declspec(naked) 1133void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { 1134 __asm { 1135 mov eax, [esp + 4] /* src_argb */ 1136 mov edx, [esp + 8] /* dst_y */ 1137 mov ecx, [esp + 12] /* width */ 1138 movdqa xmm4, xmmword ptr kARGBToY 1139 movdqa xmm5, xmmword ptr kAddY16 1140 1141 convertloop: 1142 movdqu xmm0, [eax] 1143 movdqu xmm1, [eax + 16] 1144 movdqu xmm2, [eax + 32] 1145 movdqu xmm3, [eax + 48] 1146 pmaddubsw xmm0, xmm4 1147 pmaddubsw xmm1, xmm4 1148 pmaddubsw xmm2, xmm4 1149 pmaddubsw xmm3, xmm4 1150 lea eax, [eax + 64] 1151 phaddw xmm0, xmm1 1152 phaddw xmm2, xmm3 1153 psrlw xmm0, 7 1154 psrlw xmm2, 7 1155 packuswb xmm0, xmm2 1156 paddb xmm0, xmm5 1157 movdqu [edx], xmm0 1158 lea edx, [edx + 16] 1159 sub ecx, 16 1160 jg convertloop 1161 ret 1162 } 1163} 1164 1165// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. 1166// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 1167__declspec(naked) 1168void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { 1169 __asm { 1170 mov eax, [esp + 4] /* src_argb */ 1171 mov edx, [esp + 8] /* dst_y */ 1172 mov ecx, [esp + 12] /* width */ 1173 movdqa xmm4, xmmword ptr kARGBToYJ 1174 movdqa xmm5, xmmword ptr kAddYJ64 1175 1176 convertloop: 1177 movdqu xmm0, [eax] 1178 movdqu xmm1, [eax + 16] 1179 movdqu xmm2, [eax + 32] 1180 movdqu xmm3, [eax + 48] 1181 pmaddubsw xmm0, xmm4 1182 pmaddubsw xmm1, xmm4 1183 pmaddubsw xmm2, xmm4 1184 pmaddubsw xmm3, xmm4 1185 lea eax, [eax + 64] 1186 phaddw xmm0, xmm1 1187 phaddw xmm2, xmm3 1188 paddw xmm0, xmm5 // Add .5 for rounding. 1189 paddw xmm2, xmm5 1190 psrlw xmm0, 7 1191 psrlw xmm2, 7 1192 packuswb xmm0, xmm2 1193 movdqu [edx], xmm0 1194 lea edx, [edx + 16] 1195 sub ecx, 16 1196 jg convertloop 1197 ret 1198 } 1199} 1200 1201#ifdef HAS_ARGBTOYROW_AVX2 1202// vpermd for vphaddw + vpackuswb vpermd. 
1203static const lvec32 kPermdARGBToY_AVX = { 1204 0, 4, 1, 5, 2, 6, 3, 7 1205}; 1206 1207// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 1208__declspec(naked) 1209void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { 1210 __asm { 1211 mov eax, [esp + 4] /* src_argb */ 1212 mov edx, [esp + 8] /* dst_y */ 1213 mov ecx, [esp + 12] /* width */ 1214 vbroadcastf128 ymm4, xmmword ptr kARGBToY 1215 vbroadcastf128 ymm5, xmmword ptr kAddY16 1216 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX 1217 1218 convertloop: 1219 vmovdqu ymm0, [eax] 1220 vmovdqu ymm1, [eax + 32] 1221 vmovdqu ymm2, [eax + 64] 1222 vmovdqu ymm3, [eax + 96] 1223 vpmaddubsw ymm0, ymm0, ymm4 1224 vpmaddubsw ymm1, ymm1, ymm4 1225 vpmaddubsw ymm2, ymm2, ymm4 1226 vpmaddubsw ymm3, ymm3, ymm4 1227 lea eax, [eax + 128] 1228 vphaddw ymm0, ymm0, ymm1 // mutates. 1229 vphaddw ymm2, ymm2, ymm3 1230 vpsrlw ymm0, ymm0, 7 1231 vpsrlw ymm2, ymm2, 7 1232 vpackuswb ymm0, ymm0, ymm2 // mutates. 1233 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 1234 vpaddb ymm0, ymm0, ymm5 // add 16 for Y 1235 vmovdqu [edx], ymm0 1236 lea edx, [edx + 32] 1237 sub ecx, 32 1238 jg convertloop 1239 vzeroupper 1240 ret 1241 } 1242} 1243#endif // HAS_ARGBTOYROW_AVX2 1244 1245#ifdef HAS_ARGBTOYJROW_AVX2 1246// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
1247__declspec(naked) 1248void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { 1249 __asm { 1250 mov eax, [esp + 4] /* src_argb */ 1251 mov edx, [esp + 8] /* dst_y */ 1252 mov ecx, [esp + 12] /* width */ 1253 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ 1254 vbroadcastf128 ymm5, xmmword ptr kAddYJ64 1255 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX 1256 1257 convertloop: 1258 vmovdqu ymm0, [eax] 1259 vmovdqu ymm1, [eax + 32] 1260 vmovdqu ymm2, [eax + 64] 1261 vmovdqu ymm3, [eax + 96] 1262 vpmaddubsw ymm0, ymm0, ymm4 1263 vpmaddubsw ymm1, ymm1, ymm4 1264 vpmaddubsw ymm2, ymm2, ymm4 1265 vpmaddubsw ymm3, ymm3, ymm4 1266 lea eax, [eax + 128] 1267 vphaddw ymm0, ymm0, ymm1 // mutates. 1268 vphaddw ymm2, ymm2, ymm3 1269 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. 1270 vpaddw ymm2, ymm2, ymm5 1271 vpsrlw ymm0, ymm0, 7 1272 vpsrlw ymm2, ymm2, 7 1273 vpackuswb ymm0, ymm0, ymm2 // mutates. 1274 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 1275 vmovdqu [edx], ymm0 1276 lea edx, [edx + 32] 1277 sub ecx, 32 1278 jg convertloop 1279 1280 vzeroupper 1281 ret 1282 } 1283} 1284#endif // HAS_ARGBTOYJROW_AVX2 1285 1286__declspec(naked) 1287void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { 1288 __asm { 1289 mov eax, [esp + 4] /* src_argb */ 1290 mov edx, [esp + 8] /* dst_y */ 1291 mov ecx, [esp + 12] /* width */ 1292 movdqa xmm4, xmmword ptr kBGRAToY 1293 movdqa xmm5, xmmword ptr kAddY16 1294 1295 convertloop: 1296 movdqu xmm0, [eax] 1297 movdqu xmm1, [eax + 16] 1298 movdqu xmm2, [eax + 32] 1299 movdqu xmm3, [eax + 48] 1300 pmaddubsw xmm0, xmm4 1301 pmaddubsw xmm1, xmm4 1302 pmaddubsw xmm2, xmm4 1303 pmaddubsw xmm3, xmm4 1304 lea eax, [eax + 64] 1305 phaddw xmm0, xmm1 1306 phaddw xmm2, xmm3 1307 psrlw xmm0, 7 1308 psrlw xmm2, 7 1309 packuswb xmm0, xmm2 1310 paddb xmm0, xmm5 1311 movdqu [edx], xmm0 1312 lea edx, [edx + 16] 1313 sub ecx, 16 1314 jg convertloop 1315 ret 1316 } 1317} 1318 1319__declspec(naked) 1320void 
ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { 1321 __asm { 1322 mov eax, [esp + 4] /* src_argb */ 1323 mov edx, [esp + 8] /* dst_y */ 1324 mov ecx, [esp + 12] /* width */ 1325 movdqa xmm4, xmmword ptr kABGRToY 1326 movdqa xmm5, xmmword ptr kAddY16 1327 1328 convertloop: 1329 movdqu xmm0, [eax] 1330 movdqu xmm1, [eax + 16] 1331 movdqu xmm2, [eax + 32] 1332 movdqu xmm3, [eax + 48] 1333 pmaddubsw xmm0, xmm4 1334 pmaddubsw xmm1, xmm4 1335 pmaddubsw xmm2, xmm4 1336 pmaddubsw xmm3, xmm4 1337 lea eax, [eax + 64] 1338 phaddw xmm0, xmm1 1339 phaddw xmm2, xmm3 1340 psrlw xmm0, 7 1341 psrlw xmm2, 7 1342 packuswb xmm0, xmm2 1343 paddb xmm0, xmm5 1344 movdqu [edx], xmm0 1345 lea edx, [edx + 16] 1346 sub ecx, 16 1347 jg convertloop 1348 ret 1349 } 1350} 1351 1352__declspec(naked) 1353void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { 1354 __asm { 1355 mov eax, [esp + 4] /* src_argb */ 1356 mov edx, [esp + 8] /* dst_y */ 1357 mov ecx, [esp + 12] /* width */ 1358 movdqa xmm4, xmmword ptr kRGBAToY 1359 movdqa xmm5, xmmword ptr kAddY16 1360 1361 convertloop: 1362 movdqu xmm0, [eax] 1363 movdqu xmm1, [eax + 16] 1364 movdqu xmm2, [eax + 32] 1365 movdqu xmm3, [eax + 48] 1366 pmaddubsw xmm0, xmm4 1367 pmaddubsw xmm1, xmm4 1368 pmaddubsw xmm2, xmm4 1369 pmaddubsw xmm3, xmm4 1370 lea eax, [eax + 64] 1371 phaddw xmm0, xmm1 1372 phaddw xmm2, xmm3 1373 psrlw xmm0, 7 1374 psrlw xmm2, 7 1375 packuswb xmm0, xmm2 1376 paddb xmm0, xmm5 1377 movdqu [edx], xmm0 1378 lea edx, [edx + 16] 1379 sub ecx, 16 1380 jg convertloop 1381 ret 1382 } 1383} 1384 1385__declspec(naked) 1386void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1387 uint8* dst_u, uint8* dst_v, int width) { 1388 __asm { 1389 push esi 1390 push edi 1391 mov eax, [esp + 8 + 4] // src_argb 1392 mov esi, [esp + 8 + 8] // src_stride_argb 1393 mov edx, [esp + 8 + 12] // dst_u 1394 mov edi, [esp + 8 + 16] // dst_v 1395 mov ecx, [esp + 8 + 20] // width 1396 movdqa xmm5, xmmword ptr 
kAddUV128 1397 movdqa xmm6, xmmword ptr kARGBToV 1398 movdqa xmm7, xmmword ptr kARGBToU 1399 sub edi, edx // stride from u to v 1400 1401 convertloop: 1402 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1403 movdqu xmm0, [eax] 1404 movdqu xmm4, [eax + esi] 1405 pavgb xmm0, xmm4 1406 movdqu xmm1, [eax + 16] 1407 movdqu xmm4, [eax + esi + 16] 1408 pavgb xmm1, xmm4 1409 movdqu xmm2, [eax + 32] 1410 movdqu xmm4, [eax + esi + 32] 1411 pavgb xmm2, xmm4 1412 movdqu xmm3, [eax + 48] 1413 movdqu xmm4, [eax + esi + 48] 1414 pavgb xmm3, xmm4 1415 1416 lea eax, [eax + 64] 1417 movdqa xmm4, xmm0 1418 shufps xmm0, xmm1, 0x88 1419 shufps xmm4, xmm1, 0xdd 1420 pavgb xmm0, xmm4 1421 movdqa xmm4, xmm2 1422 shufps xmm2, xmm3, 0x88 1423 shufps xmm4, xmm3, 0xdd 1424 pavgb xmm2, xmm4 1425 1426 // step 2 - convert to U and V 1427 // from here down is very similar to Y code except 1428 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1429 movdqa xmm1, xmm0 1430 movdqa xmm3, xmm2 1431 pmaddubsw xmm0, xmm7 // U 1432 pmaddubsw xmm2, xmm7 1433 pmaddubsw xmm1, xmm6 // V 1434 pmaddubsw xmm3, xmm6 1435 phaddw xmm0, xmm2 1436 phaddw xmm1, xmm3 1437 psraw xmm0, 8 1438 psraw xmm1, 8 1439 packsswb xmm0, xmm1 1440 paddb xmm0, xmm5 // -> unsigned 1441 1442 // step 3 - store 8 U and 8 V values 1443 movlps qword ptr [edx], xmm0 // U 1444 movhps qword ptr [edx + edi], xmm0 // V 1445 lea edx, [edx + 8] 1446 sub ecx, 16 1447 jg convertloop 1448 1449 pop edi 1450 pop esi 1451 ret 1452 } 1453} 1454 1455__declspec(naked) 1456void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1457 uint8* dst_u, uint8* dst_v, int width) { 1458 __asm { 1459 push esi 1460 push edi 1461 mov eax, [esp + 8 + 4] // src_argb 1462 mov esi, [esp + 8 + 8] // src_stride_argb 1463 mov edx, [esp + 8 + 12] // dst_u 1464 mov edi, [esp + 8 + 16] // dst_v 1465 mov ecx, [esp + 8 + 20] // width 1466 movdqa xmm5, xmmword ptr kAddUVJ128 1467 movdqa xmm6, xmmword ptr kARGBToVJ 1468 movdqa xmm7, xmmword ptr 
kARGBToUJ 1469 sub edi, edx // stride from u to v 1470 1471 convertloop: 1472 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1473 movdqu xmm0, [eax] 1474 movdqu xmm4, [eax + esi] 1475 pavgb xmm0, xmm4 1476 movdqu xmm1, [eax + 16] 1477 movdqu xmm4, [eax + esi + 16] 1478 pavgb xmm1, xmm4 1479 movdqu xmm2, [eax + 32] 1480 movdqu xmm4, [eax + esi + 32] 1481 pavgb xmm2, xmm4 1482 movdqu xmm3, [eax + 48] 1483 movdqu xmm4, [eax + esi + 48] 1484 pavgb xmm3, xmm4 1485 1486 lea eax, [eax + 64] 1487 movdqa xmm4, xmm0 1488 shufps xmm0, xmm1, 0x88 1489 shufps xmm4, xmm1, 0xdd 1490 pavgb xmm0, xmm4 1491 movdqa xmm4, xmm2 1492 shufps xmm2, xmm3, 0x88 1493 shufps xmm4, xmm3, 0xdd 1494 pavgb xmm2, xmm4 1495 1496 // step 2 - convert to U and V 1497 // from here down is very similar to Y code except 1498 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1499 movdqa xmm1, xmm0 1500 movdqa xmm3, xmm2 1501 pmaddubsw xmm0, xmm7 // U 1502 pmaddubsw xmm2, xmm7 1503 pmaddubsw xmm1, xmm6 // V 1504 pmaddubsw xmm3, xmm6 1505 phaddw xmm0, xmm2 1506 phaddw xmm1, xmm3 1507 paddw xmm0, xmm5 // +.5 rounding -> unsigned 1508 paddw xmm1, xmm5 1509 psraw xmm0, 8 1510 psraw xmm1, 8 1511 packsswb xmm0, xmm1 1512 1513 // step 3 - store 8 U and 8 V values 1514 movlps qword ptr [edx], xmm0 // U 1515 movhps qword ptr [edx + edi], xmm0 // V 1516 lea edx, [edx + 8] 1517 sub ecx, 16 1518 jg convertloop 1519 1520 pop edi 1521 pop esi 1522 ret 1523 } 1524} 1525 1526#ifdef HAS_ARGBTOUVROW_AVX2 1527__declspec(naked) 1528void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, 1529 uint8* dst_u, uint8* dst_v, int width) { 1530 __asm { 1531 push esi 1532 push edi 1533 mov eax, [esp + 8 + 4] // src_argb 1534 mov esi, [esp + 8 + 8] // src_stride_argb 1535 mov edx, [esp + 8 + 12] // dst_u 1536 mov edi, [esp + 8 + 16] // dst_v 1537 mov ecx, [esp + 8 + 20] // width 1538 vbroadcastf128 ymm5, xmmword ptr kAddUV128 1539 vbroadcastf128 ymm6, xmmword ptr kARGBToV 1540 vbroadcastf128 ymm7, xmmword ptr 
kARGBToU 1541 sub edi, edx // stride from u to v 1542 1543 convertloop: 1544 /* step 1 - subsample 32x2 argb pixels to 16x1 */ 1545 vmovdqu ymm0, [eax] 1546 vmovdqu ymm1, [eax + 32] 1547 vmovdqu ymm2, [eax + 64] 1548 vmovdqu ymm3, [eax + 96] 1549 vpavgb ymm0, ymm0, [eax + esi] 1550 vpavgb ymm1, ymm1, [eax + esi + 32] 1551 vpavgb ymm2, ymm2, [eax + esi + 64] 1552 vpavgb ymm3, ymm3, [eax + esi + 96] 1553 lea eax, [eax + 128] 1554 vshufps ymm4, ymm0, ymm1, 0x88 1555 vshufps ymm0, ymm0, ymm1, 0xdd 1556 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps 1557 vshufps ymm4, ymm2, ymm3, 0x88 1558 vshufps ymm2, ymm2, ymm3, 0xdd 1559 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1560 1561 // step 2 - convert to U and V 1562 // from here down is very similar to Y code except 1563 // instead of 32 different pixels, its 16 pixels of U and 16 of V 1564 vpmaddubsw ymm1, ymm0, ymm7 // U 1565 vpmaddubsw ymm3, ymm2, ymm7 1566 vpmaddubsw ymm0, ymm0, ymm6 // V 1567 vpmaddubsw ymm2, ymm2, ymm6 1568 vphaddw ymm1, ymm1, ymm3 // mutates 1569 vphaddw ymm0, ymm0, ymm2 1570 vpsraw ymm1, ymm1, 8 1571 vpsraw ymm0, ymm0, 8 1572 vpacksswb ymm0, ymm1, ymm0 // mutates 1573 vpermq ymm0, ymm0, 0xd8 // For vpacksswb 1574 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw 1575 vpaddb ymm0, ymm0, ymm5 // -> unsigned 1576 1577 // step 3 - store 16 U and 16 V values 1578 vextractf128 [edx], ymm0, 0 // U 1579 vextractf128 [edx + edi], ymm0, 1 // V 1580 lea edx, [edx + 16] 1581 sub ecx, 32 1582 jg convertloop 1583 1584 pop edi 1585 pop esi 1586 vzeroupper 1587 ret 1588 } 1589} 1590#endif // HAS_ARGBTOUVROW_AVX2 1591 1592#ifdef HAS_ARGBTOUVJROW_AVX2 1593__declspec(naked) 1594void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, 1595 uint8* dst_u, uint8* dst_v, int width) { 1596 __asm { 1597 push esi 1598 push edi 1599 mov eax, [esp + 8 + 4] // src_argb 1600 mov esi, [esp + 8 + 8] // src_stride_argb 1601 mov edx, [esp + 8 + 12] // dst_u 1602 mov edi, [esp + 8 + 16] // dst_v 
1603 mov ecx, [esp + 8 + 20] // width 1604 vbroadcastf128 ymm5, xmmword ptr kAddUV128 1605 vbroadcastf128 ymm6, xmmword ptr kARGBToV 1606 vbroadcastf128 ymm7, xmmword ptr kARGBToU 1607 sub edi, edx // stride from u to v 1608 1609 convertloop: 1610 /* step 1 - subsample 32x2 argb pixels to 16x1 */ 1611 vmovdqu ymm0, [eax] 1612 vmovdqu ymm1, [eax + 32] 1613 vmovdqu ymm2, [eax + 64] 1614 vmovdqu ymm3, [eax + 96] 1615 vpavgb ymm0, ymm0, [eax + esi] 1616 vpavgb ymm1, ymm1, [eax + esi + 32] 1617 vpavgb ymm2, ymm2, [eax + esi + 64] 1618 vpavgb ymm3, ymm3, [eax + esi + 96] 1619 lea eax, [eax + 128] 1620 vshufps ymm4, ymm0, ymm1, 0x88 1621 vshufps ymm0, ymm0, ymm1, 0xdd 1622 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps 1623 vshufps ymm4, ymm2, ymm3, 0x88 1624 vshufps ymm2, ymm2, ymm3, 0xdd 1625 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1626 1627 // step 2 - convert to U and V 1628 // from here down is very similar to Y code except 1629 // instead of 32 different pixels, its 16 pixels of U and 16 of V 1630 vpmaddubsw ymm1, ymm0, ymm7 // U 1631 vpmaddubsw ymm3, ymm2, ymm7 1632 vpmaddubsw ymm0, ymm0, ymm6 // V 1633 vpmaddubsw ymm2, ymm2, ymm6 1634 vphaddw ymm1, ymm1, ymm3 // mutates 1635 vphaddw ymm0, ymm0, ymm2 1636 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned 1637 vpaddw ymm0, ymm0, ymm5 1638 vpsraw ymm1, ymm1, 8 1639 vpsraw ymm0, ymm0, 8 1640 vpacksswb ymm0, ymm1, ymm0 // mutates 1641 vpermq ymm0, ymm0, 0xd8 // For vpacksswb 1642 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw 1643 1644 // step 3 - store 16 U and 16 V values 1645 vextractf128 [edx], ymm0, 0 // U 1646 vextractf128 [edx + edi], ymm0, 1 // V 1647 lea edx, [edx + 16] 1648 sub ecx, 32 1649 jg convertloop 1650 1651 pop edi 1652 pop esi 1653 vzeroupper 1654 ret 1655 } 1656} 1657#endif // HAS_ARGBTOUVJROW_AVX2 1658 1659__declspec(naked) 1660void ARGBToUV444Row_SSSE3(const uint8* src_argb0, 1661 uint8* dst_u, uint8* dst_v, int width) { 1662 __asm { 1663 push edi 1664 mov eax, 
[esp + 4 + 4] // src_argb 1665 mov edx, [esp + 4 + 8] // dst_u 1666 mov edi, [esp + 4 + 12] // dst_v 1667 mov ecx, [esp + 4 + 16] // width 1668 movdqa xmm5, xmmword ptr kAddUV128 1669 movdqa xmm6, xmmword ptr kARGBToV 1670 movdqa xmm7, xmmword ptr kARGBToU 1671 sub edi, edx // stride from u to v 1672 1673 convertloop: 1674 /* convert to U and V */ 1675 movdqu xmm0, [eax] // U 1676 movdqu xmm1, [eax + 16] 1677 movdqu xmm2, [eax + 32] 1678 movdqu xmm3, [eax + 48] 1679 pmaddubsw xmm0, xmm7 1680 pmaddubsw xmm1, xmm7 1681 pmaddubsw xmm2, xmm7 1682 pmaddubsw xmm3, xmm7 1683 phaddw xmm0, xmm1 1684 phaddw xmm2, xmm3 1685 psraw xmm0, 8 1686 psraw xmm2, 8 1687 packsswb xmm0, xmm2 1688 paddb xmm0, xmm5 1689 movdqu [edx], xmm0 1690 1691 movdqu xmm0, [eax] // V 1692 movdqu xmm1, [eax + 16] 1693 movdqu xmm2, [eax + 32] 1694 movdqu xmm3, [eax + 48] 1695 pmaddubsw xmm0, xmm6 1696 pmaddubsw xmm1, xmm6 1697 pmaddubsw xmm2, xmm6 1698 pmaddubsw xmm3, xmm6 1699 phaddw xmm0, xmm1 1700 phaddw xmm2, xmm3 1701 psraw xmm0, 8 1702 psraw xmm2, 8 1703 packsswb xmm0, xmm2 1704 paddb xmm0, xmm5 1705 lea eax, [eax + 64] 1706 movdqu [edx + edi], xmm0 1707 lea edx, [edx + 16] 1708 sub ecx, 16 1709 jg convertloop 1710 1711 pop edi 1712 ret 1713 } 1714} 1715 1716__declspec(naked) 1717void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1718 uint8* dst_u, uint8* dst_v, int width) { 1719 __asm { 1720 push esi 1721 push edi 1722 mov eax, [esp + 8 + 4] // src_argb 1723 mov esi, [esp + 8 + 8] // src_stride_argb 1724 mov edx, [esp + 8 + 12] // dst_u 1725 mov edi, [esp + 8 + 16] // dst_v 1726 mov ecx, [esp + 8 + 20] // width 1727 movdqa xmm5, xmmword ptr kAddUV128 1728 movdqa xmm6, xmmword ptr kBGRAToV 1729 movdqa xmm7, xmmword ptr kBGRAToU 1730 sub edi, edx // stride from u to v 1731 1732 convertloop: 1733 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1734 movdqu xmm0, [eax] 1735 movdqu xmm4, [eax + esi] 1736 pavgb xmm0, xmm4 1737 movdqu xmm1, [eax + 16] 1738 movdqu xmm4, [eax + esi + 16] 
1739 pavgb xmm1, xmm4 1740 movdqu xmm2, [eax + 32] 1741 movdqu xmm4, [eax + esi + 32] 1742 pavgb xmm2, xmm4 1743 movdqu xmm3, [eax + 48] 1744 movdqu xmm4, [eax + esi + 48] 1745 pavgb xmm3, xmm4 1746 1747 lea eax, [eax + 64] 1748 movdqa xmm4, xmm0 1749 shufps xmm0, xmm1, 0x88 1750 shufps xmm4, xmm1, 0xdd 1751 pavgb xmm0, xmm4 1752 movdqa xmm4, xmm2 1753 shufps xmm2, xmm3, 0x88 1754 shufps xmm4, xmm3, 0xdd 1755 pavgb xmm2, xmm4 1756 1757 // step 2 - convert to U and V 1758 // from here down is very similar to Y code except 1759 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1760 movdqa xmm1, xmm0 1761 movdqa xmm3, xmm2 1762 pmaddubsw xmm0, xmm7 // U 1763 pmaddubsw xmm2, xmm7 1764 pmaddubsw xmm1, xmm6 // V 1765 pmaddubsw xmm3, xmm6 1766 phaddw xmm0, xmm2 1767 phaddw xmm1, xmm3 1768 psraw xmm0, 8 1769 psraw xmm1, 8 1770 packsswb xmm0, xmm1 1771 paddb xmm0, xmm5 // -> unsigned 1772 1773 // step 3 - store 8 U and 8 V values 1774 movlps qword ptr [edx], xmm0 // U 1775 movhps qword ptr [edx + edi], xmm0 // V 1776 lea edx, [edx + 8] 1777 sub ecx, 16 1778 jg convertloop 1779 1780 pop edi 1781 pop esi 1782 ret 1783 } 1784} 1785 1786__declspec(naked) 1787void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1788 uint8* dst_u, uint8* dst_v, int width) { 1789 __asm { 1790 push esi 1791 push edi 1792 mov eax, [esp + 8 + 4] // src_argb 1793 mov esi, [esp + 8 + 8] // src_stride_argb 1794 mov edx, [esp + 8 + 12] // dst_u 1795 mov edi, [esp + 8 + 16] // dst_v 1796 mov ecx, [esp + 8 + 20] // width 1797 movdqa xmm5, xmmword ptr kAddUV128 1798 movdqa xmm6, xmmword ptr kABGRToV 1799 movdqa xmm7, xmmword ptr kABGRToU 1800 sub edi, edx // stride from u to v 1801 1802 convertloop: 1803 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1804 movdqu xmm0, [eax] 1805 movdqu xmm4, [eax + esi] 1806 pavgb xmm0, xmm4 1807 movdqu xmm1, [eax + 16] 1808 movdqu xmm4, [eax + esi + 16] 1809 pavgb xmm1, xmm4 1810 movdqu xmm2, [eax + 32] 1811 movdqu xmm4, [eax + esi + 32] 1812 
pavgb xmm2, xmm4 1813 movdqu xmm3, [eax + 48] 1814 movdqu xmm4, [eax + esi + 48] 1815 pavgb xmm3, xmm4 1816 1817 lea eax, [eax + 64] 1818 movdqa xmm4, xmm0 1819 shufps xmm0, xmm1, 0x88 1820 shufps xmm4, xmm1, 0xdd 1821 pavgb xmm0, xmm4 1822 movdqa xmm4, xmm2 1823 shufps xmm2, xmm3, 0x88 1824 shufps xmm4, xmm3, 0xdd 1825 pavgb xmm2, xmm4 1826 1827 // step 2 - convert to U and V 1828 // from here down is very similar to Y code except 1829 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1830 movdqa xmm1, xmm0 1831 movdqa xmm3, xmm2 1832 pmaddubsw xmm0, xmm7 // U 1833 pmaddubsw xmm2, xmm7 1834 pmaddubsw xmm1, xmm6 // V 1835 pmaddubsw xmm3, xmm6 1836 phaddw xmm0, xmm2 1837 phaddw xmm1, xmm3 1838 psraw xmm0, 8 1839 psraw xmm1, 8 1840 packsswb xmm0, xmm1 1841 paddb xmm0, xmm5 // -> unsigned 1842 1843 // step 3 - store 8 U and 8 V values 1844 movlps qword ptr [edx], xmm0 // U 1845 movhps qword ptr [edx + edi], xmm0 // V 1846 lea edx, [edx + 8] 1847 sub ecx, 16 1848 jg convertloop 1849 1850 pop edi 1851 pop esi 1852 ret 1853 } 1854} 1855 1856__declspec(naked) 1857void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1858 uint8* dst_u, uint8* dst_v, int width) { 1859 __asm { 1860 push esi 1861 push edi 1862 mov eax, [esp + 8 + 4] // src_argb 1863 mov esi, [esp + 8 + 8] // src_stride_argb 1864 mov edx, [esp + 8 + 12] // dst_u 1865 mov edi, [esp + 8 + 16] // dst_v 1866 mov ecx, [esp + 8 + 20] // width 1867 movdqa xmm5, xmmword ptr kAddUV128 1868 movdqa xmm6, xmmword ptr kRGBAToV 1869 movdqa xmm7, xmmword ptr kRGBAToU 1870 sub edi, edx // stride from u to v 1871 1872 convertloop: 1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1874 movdqu xmm0, [eax] 1875 movdqu xmm4, [eax + esi] 1876 pavgb xmm0, xmm4 1877 movdqu xmm1, [eax + 16] 1878 movdqu xmm4, [eax + esi + 16] 1879 pavgb xmm1, xmm4 1880 movdqu xmm2, [eax + 32] 1881 movdqu xmm4, [eax + esi + 32] 1882 pavgb xmm2, xmm4 1883 movdqu xmm3, [eax + 48] 1884 movdqu xmm4, [eax + esi + 48] 1885 
pavgb xmm3, xmm4 1886 1887 lea eax, [eax + 64] 1888 movdqa xmm4, xmm0 1889 shufps xmm0, xmm1, 0x88 1890 shufps xmm4, xmm1, 0xdd 1891 pavgb xmm0, xmm4 1892 movdqa xmm4, xmm2 1893 shufps xmm2, xmm3, 0x88 1894 shufps xmm4, xmm3, 0xdd 1895 pavgb xmm2, xmm4 1896 1897 // step 2 - convert to U and V 1898 // from here down is very similar to Y code except 1899 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1900 movdqa xmm1, xmm0 1901 movdqa xmm3, xmm2 1902 pmaddubsw xmm0, xmm7 // U 1903 pmaddubsw xmm2, xmm7 1904 pmaddubsw xmm1, xmm6 // V 1905 pmaddubsw xmm3, xmm6 1906 phaddw xmm0, xmm2 1907 phaddw xmm1, xmm3 1908 psraw xmm0, 8 1909 psraw xmm1, 8 1910 packsswb xmm0, xmm1 1911 paddb xmm0, xmm5 // -> unsigned 1912 1913 // step 3 - store 8 U and 8 V values 1914 movlps qword ptr [edx], xmm0 // U 1915 movhps qword ptr [edx + edi], xmm0 // V 1916 lea edx, [edx + 8] 1917 sub ecx, 16 1918 jg convertloop 1919 1920 pop edi 1921 pop esi 1922 ret 1923 } 1924} 1925#endif // HAS_ARGBTOYROW_SSSE3 1926 1927// Read 16 UV from 444 1928#define READYUV444_AVX2 __asm { \ 1929 __asm vmovdqu xmm0, [esi] /* U */ \ 1930 __asm vmovdqu xmm1, [esi + edi] /* V */ \ 1931 __asm lea esi, [esi + 16] \ 1932 __asm vpermq ymm0, ymm0, 0xd8 \ 1933 __asm vpermq ymm1, ymm1, 0xd8 \ 1934 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1935 __asm vmovdqu xmm4, [eax] /* Y */ \ 1936 __asm vpermq ymm4, ymm4, 0xd8 \ 1937 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1938 __asm lea eax, [eax + 16] \ 1939 } 1940 1941// Read 8 UV from 422, upsample to 16 UV. 
1942#define READYUV422_AVX2 __asm { \ 1943 __asm vmovq xmm0, qword ptr [esi] /* U */ \ 1944 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ 1945 __asm lea esi, [esi + 8] \ 1946 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1947 __asm vpermq ymm0, ymm0, 0xd8 \ 1948 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1949 __asm vmovdqu xmm4, [eax] /* Y */ \ 1950 __asm vpermq ymm4, ymm4, 0xd8 \ 1951 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1952 __asm lea eax, [eax + 16] \ 1953 } 1954 1955// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. 1956#define READYUVA422_AVX2 __asm { \ 1957 __asm vmovq xmm0, qword ptr [esi] /* U */ \ 1958 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ 1959 __asm lea esi, [esi + 8] \ 1960 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1961 __asm vpermq ymm0, ymm0, 0xd8 \ 1962 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1963 __asm vmovdqu xmm4, [eax] /* Y */ \ 1964 __asm vpermq ymm4, ymm4, 0xd8 \ 1965 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1966 __asm lea eax, [eax + 16] \ 1967 __asm vmovdqu xmm5, [ebp] /* A */ \ 1968 __asm vpermq ymm5, ymm5, 0xd8 \ 1969 __asm lea ebp, [ebp + 16] \ 1970 } 1971 1972// Read 4 UV from 411, upsample to 16 UV. 1973#define READYUV411_AVX2 __asm { \ 1974 __asm vmovd xmm0, dword ptr [esi] /* U */ \ 1975 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ 1976 __asm lea esi, [esi + 4] \ 1977 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1978 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1979 __asm vpermq ymm0, ymm0, 0xd8 \ 1980 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ 1981 __asm vmovdqu xmm4, [eax] /* Y */ \ 1982 __asm vpermq ymm4, ymm4, 0xd8 \ 1983 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1984 __asm lea eax, [eax + 16] \ 1985 } 1986 1987// Read 8 UV from NV12, upsample to 16 UV. 
1988#define READNV12_AVX2 __asm { \ 1989 __asm vmovdqu xmm0, [esi] /* UV */ \ 1990 __asm lea esi, [esi + 16] \ 1991 __asm vpermq ymm0, ymm0, 0xd8 \ 1992 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1993 __asm vmovdqu xmm4, [eax] /* Y */ \ 1994 __asm vpermq ymm4, ymm4, 0xd8 \ 1995 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1996 __asm lea eax, [eax + 16] \ 1997 } 1998 1999// Read 8 UV from NV21, upsample to 16 UV. 2000#define READNV21_AVX2 __asm { \ 2001 __asm vmovdqu xmm0, [esi] /* UV */ \ 2002 __asm lea esi, [esi + 16] \ 2003 __asm vpermq ymm0, ymm0, 0xd8 \ 2004 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ 2005 __asm vmovdqu xmm4, [eax] /* Y */ \ 2006 __asm vpermq ymm4, ymm4, 0xd8 \ 2007 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 2008 __asm lea eax, [eax + 16] \ 2009 } 2010 2011// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. 2012#define READYUY2_AVX2 __asm { \ 2013 __asm vmovdqu ymm4, [eax] /* YUY2 */ \ 2014 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ 2015 __asm vmovdqu ymm0, [eax] /* UV */ \ 2016 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ 2017 __asm lea eax, [eax + 32] \ 2018 } 2019 2020// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 2021#define READUYVY_AVX2 __asm { \ 2022 __asm vmovdqu ymm4, [eax] /* UYVY */ \ 2023 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ 2024 __asm vmovdqu ymm0, [eax] /* UV */ \ 2025 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ 2026 __asm lea eax, [eax + 32] \ 2027 } 2028 2029// Convert 16 pixels: 16 UV and 16 Y. 
2030#define YUVTORGB_AVX2(YuvConstants) __asm { \ 2031 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ 2032 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ 2033 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ 2034 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ 2035 __asm vpsubw ymm2, ymm3, ymm2 \ 2036 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ 2037 __asm vpsubw ymm1, ymm3, ymm1 \ 2038 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ 2039 __asm vpsubw ymm0, ymm3, ymm0 \ 2040 /* Step 2: Find Y contribution to 16 R,G,B values */ \ 2041 __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ 2042 __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ 2043 __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ 2044 __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ 2045 __asm vpsraw ymm0, ymm0, 6 \ 2046 __asm vpsraw ymm1, ymm1, 6 \ 2047 __asm vpsraw ymm2, ymm2, 6 \ 2048 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ 2049 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ 2050 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ 2051 } 2052 2053// Store 16 ARGB values. 2054#define STOREARGB_AVX2 __asm { \ 2055 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ 2056 __asm vpermq ymm0, ymm0, 0xd8 \ 2057 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ 2058 __asm vpermq ymm2, ymm2, 0xd8 \ 2059 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ 2060 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ 2061 __asm vmovdqu 0[edx], ymm1 \ 2062 __asm vmovdqu 32[edx], ymm0 \ 2063 __asm lea edx, [edx + 64] \ 2064 } 2065 2066// Store 16 RGBA values. 
2067#define STORERGBA_AVX2 __asm { \ 2068 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ 2069 __asm vpermq ymm1, ymm1, 0xd8 \ 2070 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ 2071 __asm vpermq ymm2, ymm2, 0xd8 \ 2072 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ 2073 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ 2074 __asm vmovdqu [edx], ymm0 \ 2075 __asm vmovdqu [edx + 32], ymm1 \ 2076 __asm lea edx, [edx + 64] \ 2077 } 2078 2079#ifdef HAS_I422TOARGBROW_AVX2 2080// 16 pixels 2081// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2082__declspec(naked) 2083void I422ToARGBRow_AVX2(const uint8* y_buf, 2084 const uint8* u_buf, 2085 const uint8* v_buf, 2086 uint8* dst_argb, 2087 const struct YuvConstants* yuvconstants, 2088 int width) { 2089 __asm { 2090 push esi 2091 push edi 2092 push ebx 2093 mov eax, [esp + 12 + 4] // Y 2094 mov esi, [esp + 12 + 8] // U 2095 mov edi, [esp + 12 + 12] // V 2096 mov edx, [esp + 12 + 16] // argb 2097 mov ebx, [esp + 12 + 20] // yuvconstants 2098 mov ecx, [esp + 12 + 24] // width 2099 sub edi, esi 2100 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2101 2102 convertloop: 2103 READYUV422_AVX2 2104 YUVTORGB_AVX2(ebx) 2105 STOREARGB_AVX2 2106 2107 sub ecx, 16 2108 jg convertloop 2109 2110 pop ebx 2111 pop edi 2112 pop esi 2113 vzeroupper 2114 ret 2115 } 2116} 2117#endif // HAS_I422TOARGBROW_AVX2 2118 2119#ifdef HAS_I422ALPHATOARGBROW_AVX2 2120// 16 pixels 2121// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
2122__declspec(naked) 2123void I422AlphaToARGBRow_AVX2(const uint8* y_buf, 2124 const uint8* u_buf, 2125 const uint8* v_buf, 2126 const uint8* a_buf, 2127 uint8* dst_argb, 2128 const struct YuvConstants* yuvconstants, 2129 int width) { 2130 __asm { 2131 push esi 2132 push edi 2133 push ebx 2134 push ebp 2135 mov eax, [esp + 16 + 4] // Y 2136 mov esi, [esp + 16 + 8] // U 2137 mov edi, [esp + 16 + 12] // V 2138 mov ebp, [esp + 16 + 16] // A 2139 mov edx, [esp + 16 + 20] // argb 2140 mov ebx, [esp + 16 + 24] // yuvconstants 2141 mov ecx, [esp + 16 + 28] // width 2142 sub edi, esi 2143 2144 convertloop: 2145 READYUVA422_AVX2 2146 YUVTORGB_AVX2(ebx) 2147 STOREARGB_AVX2 2148 2149 sub ecx, 16 2150 jg convertloop 2151 2152 pop ebp 2153 pop ebx 2154 pop edi 2155 pop esi 2156 vzeroupper 2157 ret 2158 } 2159} 2160#endif // HAS_I422ALPHATOARGBROW_AVX2 2161 2162#ifdef HAS_I444TOARGBROW_AVX2 2163// 16 pixels 2164// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 2165__declspec(naked) 2166void I444ToARGBRow_AVX2(const uint8* y_buf, 2167 const uint8* u_buf, 2168 const uint8* v_buf, 2169 uint8* dst_argb, 2170 const struct YuvConstants* yuvconstants, 2171 int width) { 2172 __asm { 2173 push esi 2174 push edi 2175 push ebx 2176 mov eax, [esp + 12 + 4] // Y 2177 mov esi, [esp + 12 + 8] // U 2178 mov edi, [esp + 12 + 12] // V 2179 mov edx, [esp + 12 + 16] // argb 2180 mov ebx, [esp + 12 + 20] // yuvconstants 2181 mov ecx, [esp + 12 + 24] // width 2182 sub edi, esi 2183 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2184 convertloop: 2185 READYUV444_AVX2 2186 YUVTORGB_AVX2(ebx) 2187 STOREARGB_AVX2 2188 2189 sub ecx, 16 2190 jg convertloop 2191 2192 pop ebx 2193 pop edi 2194 pop esi 2195 vzeroupper 2196 ret 2197 } 2198} 2199#endif // HAS_I444TOARGBROW_AVX2 2200 2201#ifdef HAS_I411TOARGBROW_AVX2 2202// 16 pixels 2203// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
// 411: one UV pair per 4 pixels; READYUV411_AVX2 upsamples 4 UV to 16.
__declspec(naked)
void I411ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb (NOTE: old comment said abgr)
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // [esi+edi] addresses V relative to U
    vpcmpeqb   ymm5, ymm5, ymm5      // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV411_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I411TOARGBROW_AVX2

#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV12: UV is interleaved in one plane, so only one chroma pointer (esi).
__declspec(naked)
void NV12ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // UV
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2

#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// NV21: same as NV12 but V and U bytes swapped; READNV21_AVX2 reorders them.
__declspec(naked)
void NV21ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* vu_buf,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // VU
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV21_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2

#ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Packed source: Y and UV come from the same buffer (eax).
__declspec(naked)
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // yuy2
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOARGBROW_AVX2

#ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2345__declspec(naked) 2346void UYVYToARGBRow_AVX2(const uint8* src_uyvy, 2347 uint8* dst_argb, 2348 const struct YuvConstants* yuvconstants, 2349 int width) { 2350 __asm { 2351 push ebx 2352 mov eax, [esp + 4 + 4] // uyvy 2353 mov edx, [esp + 4 + 8] // argb 2354 mov ebx, [esp + 4 + 12] // yuvconstants 2355 mov ecx, [esp + 4 + 16] // width 2356 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2357 2358 convertloop: 2359 READUYVY_AVX2 2360 YUVTORGB_AVX2(ebx) 2361 STOREARGB_AVX2 2362 2363 sub ecx, 16 2364 jg convertloop 2365 2366 pop ebx 2367 vzeroupper 2368 ret 2369 } 2370} 2371#endif // HAS_UYVYTOARGBROW_AVX2 2372 2373#ifdef HAS_I422TORGBAROW_AVX2 2374// 16 pixels 2375// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 2376__declspec(naked) 2377void I422ToRGBARow_AVX2(const uint8* y_buf, 2378 const uint8* u_buf, 2379 const uint8* v_buf, 2380 uint8* dst_argb, 2381 const struct YuvConstants* yuvconstants, 2382 int width) { 2383 __asm { 2384 push esi 2385 push edi 2386 push ebx 2387 mov eax, [esp + 12 + 4] // Y 2388 mov esi, [esp + 12 + 8] // U 2389 mov edi, [esp + 12 + 12] // V 2390 mov edx, [esp + 12 + 16] // abgr 2391 mov ebx, [esp + 12 + 20] // yuvconstants 2392 mov ecx, [esp + 12 + 24] // width 2393 sub edi, esi 2394 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2395 2396 convertloop: 2397 READYUV422_AVX2 2398 YUVTORGB_AVX2(ebx) 2399 STORERGBA_AVX2 2400 2401 sub ecx, 16 2402 jg convertloop 2403 2404 pop ebx 2405 pop edi 2406 pop esi 2407 vzeroupper 2408 ret 2409 } 2410} 2411#endif // HAS_I422TORGBAROW_AVX2 2412 2413#if defined(HAS_I422TOARGBROW_SSSE3) 2414// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. 2415// Allows a conversion with half size scaling. 2416 2417// Read 8 UV from 444. 
// SSSE3 read macros.  Register convention used throughout this section:
//   eax = Y pointer, esi = U pointer, edi = V-U delta (so [esi+edi] is V),
//   ebp = A pointer where applicable.
//   Output: xmm0 = interleaved UV bytes, xmm4 = Y duplicated to 16 bits.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                             \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
// Additionally loads 8 alpha bytes into xmm5 from [ebp].
#define READYUVA422 __asm {                                                    \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
    __asm lea        ebp, [ebp + 8]                                            \
  }

// Read 2 UV from 411, upsample to 8 UV.
// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
//  __asm pinsrw     xmm0, [esi], 0        /* U */
//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
// Workaround: load the 16-bit values through ebx (caller must preserve ebx).
#define READYUV411_EBX __asm {                                                 \
    __asm movzx      ebx,  word ptr [esi]        /* U */                       \
    __asm movd       xmm0, ebx                                                 \
    __asm movzx      ebx,  word ptr [esi + edi]  /* V */                       \
    __asm movd       xmm1, ebx                                                 \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax,  [eax + 8]                                           \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }

// Read 4 VU from NV21, upsample to 8 UV.
// kShuffleNV21 swaps V/U byte order while upsampling.
#define READNV21 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
    __asm lea        esi,  [esi + 8]                                           \
    __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
    __asm movq       xmm4, qword ptr [eax]                                     \
    __asm punpcklbw  xmm4, xmm4                                                \
    __asm lea        eax, [eax + 8]                                            \
  }

// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
// Same 16 source bytes are shuffled twice: once for Y, once for UV.
#define READYUY2 __asm {                                                       \
    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
    __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
    __asm movdqu     xmm0, [eax]          /* UV */                             \
    __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
    __asm lea        eax, [eax + 16]                                           \
  }

// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY __asm {                                                       \
    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
    __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
    __asm movdqu     xmm0, [eax]          /* UV */                             \
    __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
    __asm lea        eax, [eax + 16]                                           \
  }

// Convert 8 pixels: 8 UV and 8 Y.
2511#define YUVTORGB(YuvConstants) __asm { \ 2512 __asm movdqa xmm1, xmm0 \ 2513 __asm movdqa xmm2, xmm0 \ 2514 __asm movdqa xmm3, xmm0 \ 2515 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ 2516 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ 2517 __asm psubw xmm0, xmm1 \ 2518 __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ 2519 __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ 2520 __asm psubw xmm1, xmm2 \ 2521 __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ 2522 __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ 2523 __asm psubw xmm2, xmm3 \ 2524 __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ 2525 __asm paddsw xmm0, xmm4 /* B += Y */ \ 2526 __asm paddsw xmm1, xmm4 /* G += Y */ \ 2527 __asm paddsw xmm2, xmm4 /* R += Y */ \ 2528 __asm psraw xmm0, 6 \ 2529 __asm psraw xmm1, 6 \ 2530 __asm psraw xmm2, 6 \ 2531 __asm packuswb xmm0, xmm0 /* B */ \ 2532 __asm packuswb xmm1, xmm1 /* G */ \ 2533 __asm packuswb xmm2, xmm2 /* R */ \ 2534 } 2535 2536// Store 8 ARGB values. 2537#define STOREARGB __asm { \ 2538 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2539 __asm punpcklbw xmm2, xmm5 /* RA */ \ 2540 __asm movdqa xmm1, xmm0 \ 2541 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ 2542 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ 2543 __asm movdqu 0[edx], xmm0 \ 2544 __asm movdqu 16[edx], xmm1 \ 2545 __asm lea edx, [edx + 32] \ 2546 } 2547 2548// Store 8 BGRA values. 2549#define STOREBGRA __asm { \ 2550 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 2551 __asm punpcklbw xmm1, xmm0 /* GB */ \ 2552 __asm punpcklbw xmm5, xmm2 /* AR */ \ 2553 __asm movdqa xmm0, xmm5 \ 2554 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ 2555 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ 2556 __asm movdqu 0[edx], xmm5 \ 2557 __asm movdqu 16[edx], xmm0 \ 2558 __asm lea edx, [edx + 32] \ 2559 } 2560 2561// Store 8 RGBA values. 
// Store 8 RGBA values.  Regenerates alpha in xmm5 (clobbers it); writes 32
// bytes to [edx] in ABGR byte order (RGBA pixels) and advances edx.
#define STORERGBA __asm {                                                      \
    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
    __asm movdqa     xmm0, xmm5                                                \
    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
    __asm movdqu     0[edx], xmm5                                              \
    __asm movdqu     16[edx], xmm0                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }

// Store 8 RGB24 values (24 bytes).  Requires xmm5/xmm6 preloaded with
// kShuffleMaskARGBToRGB24_0 / kShuffleMaskARGBToRGB24 by the caller.
#define STORERGB24 __asm {                                                     \
    /* Weave into RRGB */                                                      \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* RRGB -> RGB24 */                                                        \
    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
    __asm lea        edx,  [edx + 24]                                          \
  }

// Store 8 RGB565 values (16 bytes).  Requires masks preloaded by caller:
// xmm5 = 0x0000001f (B), xmm6 = 0x000007e0 (G), xmm7 = 0xfffff800 (R).
#define STORERGB565 __asm {                                                    \
    /* Weave into RRGB */                                                      \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
    /* RRGB -> RGB565 */                                                       \
    __asm movdqa     xmm3, xmm0           /* B  first 4 pixels of argb */      \
    __asm movdqa     xmm2, xmm0           /* G */                              \
    __asm pslld      xmm0, 8              /* R */                              \
    __asm psrld      xmm3, 3              /* B */                              \
    __asm psrld      xmm2, 5              /* G */                              \
    __asm psrad      xmm0, 16             /* R */                              \
    __asm pand       xmm3, xmm5           /* B */                              \
    __asm pand       xmm2, xmm6           /* G */                              \
    __asm pand       xmm0, xmm7           /* R */                              \
    __asm por        xmm3, xmm2           /* BG */                             \
    __asm por        xmm0, xmm3           /* BGR */                            \
    __asm movdqa     xmm3, xmm1           /* B  next 4 pixels of argb */       \
    __asm movdqa     xmm2, xmm1           /* G */                              \
    __asm pslld      xmm1, 8              /* R */                              \
    __asm psrld      xmm3, 3              /* B */                              \
    __asm psrld      xmm2, 5              /* G */                              \
    __asm psrad      xmm1, 16             /* R */                              \
    __asm pand       xmm3, xmm5           /* B */                              \
    __asm pand       xmm2, xmm6           /* G */                              \
    __asm pand       xmm1, xmm7           /* R */                              \
    __asm por        xmm3, xmm2           /* BG */                             \
    __asm por        xmm1, xmm3           /* BGR */                            \
    __asm packssdw   xmm0, xmm1                                                \
    __asm movdqu     0[edx], xmm0         /* store 8 pixels of RGB565 */       \
    __asm lea        edx, [edx + 16]                                           \
  }

// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2629__declspec(naked) 2630void I444ToARGBRow_SSSE3(const uint8* y_buf, 2631 const uint8* u_buf, 2632 const uint8* v_buf, 2633 uint8* dst_argb, 2634 const struct YuvConstants* yuvconstants, 2635 int width) { 2636 __asm { 2637 push esi 2638 push edi 2639 push ebx 2640 mov eax, [esp + 12 + 4] // Y 2641 mov esi, [esp + 12 + 8] // U 2642 mov edi, [esp + 12 + 12] // V 2643 mov edx, [esp + 12 + 16] // argb 2644 mov ebx, [esp + 12 + 20] // yuvconstants 2645 mov ecx, [esp + 12 + 24] // width 2646 sub edi, esi 2647 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2648 2649 convertloop: 2650 READYUV444 2651 YUVTORGB(ebx) 2652 STOREARGB 2653 2654 sub ecx, 8 2655 jg convertloop 2656 2657 pop ebx 2658 pop edi 2659 pop esi 2660 ret 2661 } 2662} 2663 2664// 8 pixels. 2665// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 2666__declspec(naked) 2667void I422ToRGB24Row_SSSE3(const uint8* y_buf, 2668 const uint8* u_buf, 2669 const uint8* v_buf, 2670 uint8* dst_rgb24, 2671 const struct YuvConstants* yuvconstants, 2672 int width) { 2673 __asm { 2674 push esi 2675 push edi 2676 push ebx 2677 mov eax, [esp + 12 + 4] // Y 2678 mov esi, [esp + 12 + 8] // U 2679 mov edi, [esp + 12 + 12] // V 2680 mov edx, [esp + 12 + 16] // argb 2681 mov ebx, [esp + 12 + 20] // yuvconstants 2682 mov ecx, [esp + 12 + 24] // width 2683 sub edi, esi 2684 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 2685 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 2686 2687 convertloop: 2688 READYUV422 2689 YUVTORGB(ebx) 2690 STORERGB24 2691 2692 sub ecx, 8 2693 jg convertloop 2694 2695 pop ebx 2696 pop edi 2697 pop esi 2698 ret 2699 } 2700} 2701 2702// 8 pixels 2703// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). 
2704__declspec(naked) 2705void I422ToRGB565Row_SSSE3(const uint8* y_buf, 2706 const uint8* u_buf, 2707 const uint8* v_buf, 2708 uint8* rgb565_buf, 2709 const struct YuvConstants* yuvconstants, 2710 int width) { 2711 __asm { 2712 push esi 2713 push edi 2714 push ebx 2715 mov eax, [esp + 12 + 4] // Y 2716 mov esi, [esp + 12 + 8] // U 2717 mov edi, [esp + 12 + 12] // V 2718 mov edx, [esp + 12 + 16] // argb 2719 mov ebx, [esp + 12 + 20] // yuvconstants 2720 mov ecx, [esp + 12 + 24] // width 2721 sub edi, esi 2722 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f 2723 psrld xmm5, 27 2724 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 2725 psrld xmm6, 26 2726 pslld xmm6, 5 2727 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 2728 pslld xmm7, 11 2729 2730 convertloop: 2731 READYUV422 2732 YUVTORGB(ebx) 2733 STORERGB565 2734 2735 sub ecx, 8 2736 jg convertloop 2737 2738 pop ebx 2739 pop edi 2740 pop esi 2741 ret 2742 } 2743} 2744 2745// 8 pixels. 2746// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2747__declspec(naked) 2748void I422ToARGBRow_SSSE3(const uint8* y_buf, 2749 const uint8* u_buf, 2750 const uint8* v_buf, 2751 uint8* dst_argb, 2752 const struct YuvConstants* yuvconstants, 2753 int width) { 2754 __asm { 2755 push esi 2756 push edi 2757 push ebx 2758 mov eax, [esp + 12 + 4] // Y 2759 mov esi, [esp + 12 + 8] // U 2760 mov edi, [esp + 12 + 12] // V 2761 mov edx, [esp + 12 + 16] // argb 2762 mov ebx, [esp + 12 + 20] // yuvconstants 2763 mov ecx, [esp + 12 + 24] // width 2764 sub edi, esi 2765 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2766 2767 convertloop: 2768 READYUV422 2769 YUVTORGB(ebx) 2770 STOREARGB 2771 2772 sub ecx, 8 2773 jg convertloop 2774 2775 pop ebx 2776 pop edi 2777 pop esi 2778 ret 2779 } 2780} 2781 2782// 8 pixels. 2783// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
2784__declspec(naked) 2785void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, 2786 const uint8* u_buf, 2787 const uint8* v_buf, 2788 const uint8* a_buf, 2789 uint8* dst_argb, 2790 const struct YuvConstants* yuvconstants, 2791 int width) { 2792 __asm { 2793 push esi 2794 push edi 2795 push ebx 2796 push ebp 2797 mov eax, [esp + 16 + 4] // Y 2798 mov esi, [esp + 16 + 8] // U 2799 mov edi, [esp + 16 + 12] // V 2800 mov ebp, [esp + 16 + 16] // A 2801 mov edx, [esp + 16 + 20] // argb 2802 mov ebx, [esp + 16 + 24] // yuvconstants 2803 mov ecx, [esp + 16 + 28] // width 2804 sub edi, esi 2805 2806 convertloop: 2807 READYUVA422 2808 YUVTORGB(ebx) 2809 STOREARGB 2810 2811 sub ecx, 8 2812 jg convertloop 2813 2814 pop ebp 2815 pop ebx 2816 pop edi 2817 pop esi 2818 ret 2819 } 2820} 2821 2822// 8 pixels. 2823// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2824// Similar to I420 but duplicate UV once more. 2825__declspec(naked) 2826void I411ToARGBRow_SSSE3(const uint8* y_buf, 2827 const uint8* u_buf, 2828 const uint8* v_buf, 2829 uint8* dst_argb, 2830 const struct YuvConstants* yuvconstants, 2831 int width) { 2832 __asm { 2833 push esi 2834 push edi 2835 push ebx 2836 push ebp 2837 mov eax, [esp + 16 + 4] // Y 2838 mov esi, [esp + 16 + 8] // U 2839 mov edi, [esp + 16 + 12] // V 2840 mov edx, [esp + 16 + 16] // abgr 2841 mov ebp, [esp + 16 + 20] // yuvconstants 2842 mov ecx, [esp + 16 + 24] // width 2843 sub edi, esi 2844 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2845 2846 convertloop: 2847 READYUV411_EBX 2848 YUVTORGB(ebp) 2849 STOREARGB 2850 2851 sub ecx, 8 2852 jg convertloop 2853 2854 pop ebp 2855 pop ebx 2856 pop edi 2857 pop esi 2858 ret 2859 } 2860} 2861 2862// 8 pixels. 2863// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
2864__declspec(naked) 2865void NV12ToARGBRow_SSSE3(const uint8* y_buf, 2866 const uint8* uv_buf, 2867 uint8* dst_argb, 2868 const struct YuvConstants* yuvconstants, 2869 int width) { 2870 __asm { 2871 push esi 2872 push ebx 2873 mov eax, [esp + 8 + 4] // Y 2874 mov esi, [esp + 8 + 8] // UV 2875 mov edx, [esp + 8 + 12] // argb 2876 mov ebx, [esp + 8 + 16] // yuvconstants 2877 mov ecx, [esp + 8 + 20] // width 2878 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2879 2880 convertloop: 2881 READNV12 2882 YUVTORGB(ebx) 2883 STOREARGB 2884 2885 sub ecx, 8 2886 jg convertloop 2887 2888 pop ebx 2889 pop esi 2890 ret 2891 } 2892} 2893 2894// 8 pixels. 2895// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2896__declspec(naked) 2897void NV21ToARGBRow_SSSE3(const uint8* y_buf, 2898 const uint8* vu_buf, 2899 uint8* dst_argb, 2900 const struct YuvConstants* yuvconstants, 2901 int width) { 2902 __asm { 2903 push esi 2904 push ebx 2905 mov eax, [esp + 8 + 4] // Y 2906 mov esi, [esp + 8 + 8] // VU 2907 mov edx, [esp + 8 + 12] // argb 2908 mov ebx, [esp + 8 + 16] // yuvconstants 2909 mov ecx, [esp + 8 + 20] // width 2910 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2911 2912 convertloop: 2913 READNV21 2914 YUVTORGB(ebx) 2915 STOREARGB 2916 2917 sub ecx, 8 2918 jg convertloop 2919 2920 pop ebx 2921 pop esi 2922 ret 2923 } 2924} 2925 2926// 8 pixels. 2927// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
// Packed YUY2 source: Y and UV both read from eax.
__declspec(naked)
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // yuy2
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READYUY2
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    ret
  }
}

// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked)
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]   // uyvy
    mov        edx, [esp + 4 + 8]   // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha

 convertloop:
    READUYVY
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    ret
  }
}

// 8 pixels of I422 to 8 RGBA pixels; STORERGBA regenerates alpha itself,
// so no xmm5 preload is needed here.
__declspec(naked)
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgba (NOTE: old comment said argb)
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGBA

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
// Grey-scale expansion: G = (Y - 16) * 1.164 replicated to B/G/R, alpha 0xff.
// Fixed point with 6 fractional bits (psrlw 6 at the end of step 1).
__declspec(naked)
void I400ToARGBRow_SSE2(const uint8* y_buf,
                        uint8* rgb_buf,
                        int width) {
  __asm {
    mov        eax, 0x4a354a35       // 4a35 = 18997 = round(1.164 * 64 * 256)
    movd       xmm2, eax
    pshufd     xmm2, xmm2, 0
    mov        eax, 0x04880488       // 0488 = 1160 = round(1.164 * 64 * 16)
    movd       xmm3, eax
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4            // generate mask 0xff000000
    pslld      xmm4, 24

    mov        eax, [esp + 4]        // Y
    mov        edx, [esp + 8]        // rgb
    mov        ecx, [esp + 12]       // width

 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0             // Y.Y
    pmulhuw    xmm0, xmm2
    psubusw    xmm0, xmm3             // saturating subtract clamps at zero
    psrlw      xmm0, 6
    packuswb   xmm0, xmm0             // G

    // Step 2: Weave into ARGB
    punpcklbw  xmm0, xmm0             // GG
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0             // BGRA first 4 pixels
    punpckhwd  xmm1, xmm1             // BGRA next 4 pixels
    por        xmm0, xmm4             // set alpha bytes to 0xff
    por        xmm1, xmm4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_I400TOARGBROW_SSE2

#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
3067__declspec(naked) 3068void I400ToARGBRow_AVX2(const uint8* y_buf, 3069 uint8* rgb_buf, 3070 int width) { 3071 __asm { 3072 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 3073 vmovd xmm2, eax 3074 vbroadcastss ymm2, xmm2 3075 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 3076 vmovd xmm3, eax 3077 vbroadcastss ymm3, xmm3 3078 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 3079 vpslld ymm4, ymm4, 24 3080 3081 mov eax, [esp + 4] // Y 3082 mov edx, [esp + 8] // rgb 3083 mov ecx, [esp + 12] // width 3084 3085 convertloop: 3086 // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 3087 vmovdqu xmm0, [eax] 3088 lea eax, [eax + 16] 3089 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates 3090 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y 3091 vpmulhuw ymm0, ymm0, ymm2 3092 vpsubusw ymm0, ymm0, ymm3 3093 vpsrlw ymm0, ymm0, 6 3094 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 3095 3096 // TODO(fbarchard): Weave alpha with unpack. 3097 // Step 2: Weave into ARGB 3098 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates 3099 vpermq ymm1, ymm1, 0xd8 3100 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels 3101 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels 3102 vpor ymm0, ymm0, ymm4 3103 vpor ymm1, ymm1, ymm4 3104 vmovdqu [edx], ymm0 3105 vmovdqu [edx + 32], ymm1 3106 lea edx, [edx + 64] 3107 sub ecx, 16 3108 jg convertloop 3109 vzeroupper 3110 ret 3111 } 3112} 3113#endif // HAS_I400TOARGBROW_AVX2 3114 3115#ifdef HAS_MIRRORROW_SSSE3 3116// Shuffle table for reversing the bytes. 3117static const uvec8 kShuffleMirror = { 3118 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3119}; 3120 3121// TODO(fbarchard): Replace lea with -16 offset. 
3122__declspec(naked) 3123void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 3124 __asm { 3125 mov eax, [esp + 4] // src 3126 mov edx, [esp + 8] // dst 3127 mov ecx, [esp + 12] // width 3128 movdqa xmm5, xmmword ptr kShuffleMirror 3129 3130 convertloop: 3131 movdqu xmm0, [eax - 16 + ecx] 3132 pshufb xmm0, xmm5 3133 movdqu [edx], xmm0 3134 lea edx, [edx + 16] 3135 sub ecx, 16 3136 jg convertloop 3137 ret 3138 } 3139} 3140#endif // HAS_MIRRORROW_SSSE3 3141 3142#ifdef HAS_MIRRORROW_AVX2 3143__declspec(naked) 3144void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3145 __asm { 3146 mov eax, [esp + 4] // src 3147 mov edx, [esp + 8] // dst 3148 mov ecx, [esp + 12] // width 3149 vbroadcastf128 ymm5, xmmword ptr kShuffleMirror 3150 3151 convertloop: 3152 vmovdqu ymm0, [eax - 32 + ecx] 3153 vpshufb ymm0, ymm0, ymm5 3154 vpermq ymm0, ymm0, 0x4e // swap high and low halfs 3155 vmovdqu [edx], ymm0 3156 lea edx, [edx + 32] 3157 sub ecx, 32 3158 jg convertloop 3159 vzeroupper 3160 ret 3161 } 3162} 3163#endif // HAS_MIRRORROW_AVX2 3164 3165#ifdef HAS_MIRRORUVROW_SSSE3 3166// Shuffle table for reversing the bytes of UV channels. 
3167static const uvec8 kShuffleMirrorUV = { 3168 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 3169}; 3170 3171__declspec(naked) 3172void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 3173 int width) { 3174 __asm { 3175 push edi 3176 mov eax, [esp + 4 + 4] // src 3177 mov edx, [esp + 4 + 8] // dst_u 3178 mov edi, [esp + 4 + 12] // dst_v 3179 mov ecx, [esp + 4 + 16] // width 3180 movdqa xmm1, xmmword ptr kShuffleMirrorUV 3181 lea eax, [eax + ecx * 2 - 16] 3182 sub edi, edx 3183 3184 convertloop: 3185 movdqu xmm0, [eax] 3186 lea eax, [eax - 16] 3187 pshufb xmm0, xmm1 3188 movlpd qword ptr [edx], xmm0 3189 movhpd qword ptr [edx + edi], xmm0 3190 lea edx, [edx + 8] 3191 sub ecx, 8 3192 jg convertloop 3193 3194 pop edi 3195 ret 3196 } 3197} 3198#endif // HAS_MIRRORUVROW_SSSE3 3199 3200#ifdef HAS_ARGBMIRRORROW_SSE2 3201__declspec(naked) 3202void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3203 __asm { 3204 mov eax, [esp + 4] // src 3205 mov edx, [esp + 8] // dst 3206 mov ecx, [esp + 12] // width 3207 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 3208 3209 convertloop: 3210 movdqu xmm0, [eax] 3211 lea eax, [eax - 16] 3212 pshufd xmm0, xmm0, 0x1b 3213 movdqu [edx], xmm0 3214 lea edx, [edx + 16] 3215 sub ecx, 4 3216 jg convertloop 3217 ret 3218 } 3219} 3220#endif // HAS_ARGBMIRRORROW_SSE2 3221 3222#ifdef HAS_ARGBMIRRORROW_AVX2 3223// Shuffle table for reversing the bytes. 
// Dword (pixel) permute pattern that reverses 8 ARGB pixels in a ymm.
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Reverses a row of ARGB pixels, 8 at a time, reading from the tail of src.
__declspec(naked)
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2

 convertloop:
    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    sub       ecx, 8
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
// Splits an interleaved UV row into separate U and V rows, 16 pairs per
// iteration.  edi holds dst_v - dst_u so edx indexes both outputs.
__declspec(naked)
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes (U)
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes (V)
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [edx], xmm0
    movdqu     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_SPLITUVROW_AVX2
// AVX2 split: vpermq 0xd8 fixes lane order after in-lane vpackuswb.
__declspec(naked)
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm2, ymm0, 8      // odd bytes
    vpsrlw     ymm3, ymm1, 8
    vpand      ymm0, ymm0, ymm5   // even bytes
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpackuswb  ymm2, ymm2, ymm3
    vpermq     ymm0, ymm0, 0xd8
    vpermq     ymm2, ymm2, 0xd8
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + edi], ymm2
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_SPLITUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
// Interleaves separate U and V rows into one UV row; edx holds
// src_v - src_u so eax indexes both inputs.
__declspec(naked)
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_u
    mov        edx, [esp + 4 + 8]    // src_v
    mov        edi, [esp + 4 + 12]   // dst_uv
    mov        ecx, [esp + 4 + 16]   // width
    sub        edx, eax

 convertloop:
    movdqu     xmm0, [eax]      // read 16 U's
    movdqu     xmm1, [eax + edx]  // and 16 V's
    lea        eax,  [eax + 16]
    movdqa     xmm2, xmm0
    punpcklbw  xmm0, xmm1       // first 8 UV pairs
    punpckhbw  xmm2, xmm1       // next 8 UV pairs
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
// AVX2 merge: in-lane unpack scrambles 128-bit quarters, so vextractf128
// writes them back to memory in the correct order.
__declspec(naked)
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_u
    mov        edx, [esp + 4 + 8]    // src_v
    mov        edi, [esp + 4 + 12]   // dst_uv
    mov        ecx, [esp + 4 + 16]   // width
    sub        edx, eax

 convertloop:
    vmovdqu    ymm0, [eax]           // read 32 U's
    vmovdqu    ymm1, [eax + edx]     // and 32 V's
    lea        eax,  [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
    vextractf128 [edi], ymm2, 0       // bytes 0..15
    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63 (NOTE: comment was 47..63)
    lea        edi, [edi + 64]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_COPYROW_SSE2
// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
// Uses aligned moves when both pointers are 16-byte aligned.
__declspec(naked)
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    test       eax, 15          // branch to unaligned loop unless both
    jne        convertloopu     // src and dst are 16-byte aligned
    test       edx, 15
    jne        convertloopu

 convertloopa:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloopa
    ret

 convertloopu:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloopu
    ret
  }
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
__declspec(naked)
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 64
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_COPYROW_AVX

// Multiple of 1.
// Copy 'count' bytes with rep movsb (enhanced rep movsb hardware path).
__declspec(naked)
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi         // save callee-saved esi/edi in scratch regs
    mov        edx, edi
    mov        esi, [esp + 4]   // src
    mov        edi, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    rep movsb
    mov        edi, edx         // restore esi/edi
    mov        esi, eax
    ret
  }
}

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// Copy the alpha channel of src ARGB to dst ARGB, preserving dst RGB.
// width in pixels
__declspec(naked)
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
    pslld      xmm0, 24
    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
    psrld      xmm1, 8

  convertloop:
    movdqu     xmm2, [eax]
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    movdqu     xmm4, [edx]
    movdqu     xmm5, [edx + 16]
    pand       xmm2, xmm0       // keep src alpha
    pand       xmm3, xmm0
    pand       xmm4, xmm1       // keep dst RGB
    pand       xmm5, xmm1
    por        xmm2, xmm4
    por        xmm3, xmm5
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// Copy the alpha channel of src ARGB to dst ARGB, preserving dst RGB.
// width in pixels
__declspec(naked)
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    vpcmpeqb   ymm0, ymm0, ymm0
    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff

  convertloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + 32]
    lea        eax, [eax + 64]
    // Blend keeps dst RGB (masked bytes) and src alpha.
    vpblendvb  ymm1, ymm1, [edx], ymm0
    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extract the alpha byte of each ARGB pixel into dst_a.
__declspec(naked)
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_a
    mov        ecx, [esp + 12]  // width

  extractloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 24         // shift alpha into low byte of each dword
    psrld      xmm1, 24
    packssdw   xmm0, xmm1       // pack dwords to words, then words to bytes
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         extractloop

    ret
  }
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// Copy 8 Y bytes per iteration into the alpha channel of 8 dst ARGB pixels.
// width in pixels
__declspec(naked)
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
    pslld      xmm0, 24
    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
    psrld      xmm1, 8

  convertloop:
    movq       xmm2, qword ptr [eax]  // 8 Y's
    lea        eax, [eax + 8]
    punpcklbw  xmm2, xmm2       // duplicate each Y into a word
    punpckhwd  xmm3, xmm2       // xmm3 junk words are masked off by xmm0 below
    punpcklwd  xmm2, xmm2
    movdqu     xmm4, [edx]
    movdqu     xmm5, [edx + 16]
    pand       xmm2, xmm0       // keep Y in alpha position
    pand       xmm3, xmm0
    pand       xmm4, xmm1       // keep dst RGB
    pand       xmm5, xmm1
    por        xmm2, xmm4
    por        xmm3, xmm5
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// Copy 16 Y bytes per iteration into the alpha channel of 16 dst ARGB pixels.
// width in pixels
__declspec(naked)
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    vpcmpeqb   ymm0, ymm0, ymm0
    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff

  convertloop:
    vpmovzxbd  ymm1, qword ptr [eax]      // widen 8 Y bytes to dwords
    vpmovzxbd  ymm2, qword ptr [eax + 8]
    lea        eax, [eax + 16]
    vpslld     ymm1, ymm1, 24   // move Y into alpha byte
    vpslld     ymm2, ymm2, 24
    vpblendvb  ymm1, ymm1, [edx], ymm0    // keep dst RGB, new alpha
    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
__declspec(naked)
void SetRow_X86(uint8* dst, uint8 v8, int count) {
  __asm {
    movzx      eax, byte ptr [esp + 8]    // v8
    mov        edx, 0x01010101  // Duplicate byte to all bytes.
    mul        edx              // overwrites edx with upper part of result.
    mov        edx, edi         // save edi
    mov        edi, [esp + 4]   // dst
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2           // stosd writes 4 bytes at a time
    rep stosd
    mov        edi, edx         // restore edi
    ret
  }
}

// Write 'count' bytes using an 8 bit value repeated.
__declspec(naked)
void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
  __asm {
    mov        edx, edi         // save edi
    mov        edi, [esp + 4]   // dst
    mov        eax, [esp + 8]   // v8
    mov        ecx, [esp + 12]  // count
    rep stosb
    mov        edi, edx         // restore edi
    ret
  }
}

// Write 'count' 32 bit values.
__declspec(naked)
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
  __asm {
    mov        edx, edi         // save edi
    mov        edi, [esp + 4]   // dst
    mov        eax, [esp + 8]   // v32
    mov        ecx, [esp + 12]  // count
    rep stosd
    mov        edi, edx         // restore edi
    ret
  }
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_AVX2
// Extract 32 Y bytes per iteration from packed YUY2 (even bytes are Y).
__declspec(naked)
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_yuy2
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5  // even bytes are Y
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1  // mutates.
    vpermq     ymm0, ymm0, 0xd8  // undo lane mutation
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}

// Extract U and V planes from 2 rows of YUY2, averaging vertically.
__declspec(naked)
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]       // average with next row
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8     // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}

// Extract U and V planes from a single row of YUY2 (no vertical averaging).
__declspec(naked)
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8     // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}

// Extract 32 Y bytes per iteration from packed UYVY (odd bytes are Y).
__declspec(naked)
void UYVYToYRow_AVX2(const uint8* src_uyvy,
                     uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_uyvy
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // width

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8     // odd bytes are Y
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // mutates.
    vpermq     ymm0, ymm0, 0xd8  // undo lane mutation
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}

// Extract U and V planes from 2 rows of UYVY, averaging vertically.
__declspec(naked)
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]       // average with next row
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1  // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}

// Extract U and V planes from a single row of UYVY (no vertical averaging).
__declspec(naked)
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1  // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5  // U
    vpsrlw     ymm0, ymm0, 8     // V
    vpackuswb  ymm1, ymm1, ymm1  // mutates.
    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_YUY2TOYROW_SSE2
// Extract 16 Y bytes per iteration from packed YUY2 (even bytes are Y).
__declspec(naked)
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_yuy2
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5       // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

// Extract U and V planes from 2 rows of YUY2, averaging vertically.
__declspec(naked)
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]       // next row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2              // average vertically
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// Extract U and V planes from a single row of YUY2 (no vertical averaging).
__declspec(naked)
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_yuy2
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

// Extract 16 Y bytes per iteration from packed UYVY (odd bytes are Y).
__declspec(naked)
void UYVYToYRow_SSE2(const uint8* src_uyvy,
                     uint8* dst_y, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_uyvy
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // width

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

// Extract U and V planes from 2 rows of UYVY, averaging vertically.
__declspec(naked)
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]       // next row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2              // average vertically
    pavgb      xmm1, xmm3
    pand       xmm0, xmm5   // UYVY -> UVUV
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// Extract U and V planes from a single row of UYVY (no vertical averaging).
__declspec(naked)
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uyvy
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx              // edi = dst_v - dst_u offset

  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5   // UYVY -> UVUV
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
__declspec(naked)
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  __asm {
    push       esi
    push       edi
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0x00

    mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
    movd       xmm7, eax
    pshufd     xmm7, xmm7, 0x00
    mov        eax, [esp + 8 + 4]   // src0
    mov        edx, [esp + 8 + 8]   // src1
    mov        esi, [esp + 8 + 12]  // alpha
    mov        edi, [esp + 8 + 16]  // dst
    mov        ecx, [esp + 8 + 20]  // width
    // Index all pointers off the alpha pointer (esi).
    sub        eax, esi
    sub        edx, esi
    sub        edi, esi

    // 8 pixel loop.
  convertloop8:
    movq       xmm0, qword ptr [esi]        // alpha
    punpcklbw  xmm0, xmm0
    pxor       xmm0, xmm5         // a, 255-a
    movq       xmm1, qword ptr [eax + esi]  // src0
    movq       xmm2, qword ptr [edx + esi]  // src1
    punpcklbw  xmm1, xmm2
    psubb      xmm1, xmm6         // bias src0/1 - 128
    pmaddubsw  xmm0, xmm1
    paddw      xmm0, xmm7         // unbias result - 32768 and round.
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edi + esi], xmm0
    lea        esi, [esi + 8]
    sub        ecx, 8
    jg         convertloop8

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
__declspec(naked)
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                        const uint8* alpha, uint8* dst, int width) {
  __asm {
    push       esi
    push       edi
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff00ff00
    vpsllw     ymm5, ymm5, 8
    mov        eax, 0x80808080   // 128 for biasing image to signed.
    vmovd      xmm6, eax
    vbroadcastss ymm6, xmm6
    mov        eax, 0x807f807f   // 32768 + 127 for unbias and round.
    vmovd      xmm7, eax
    vbroadcastss ymm7, xmm7
    mov        eax, [esp + 8 + 4]   // src0
    mov        edx, [esp + 8 + 8]   // src1
    mov        esi, [esp + 8 + 12]  // alpha
    mov        edi, [esp + 8 + 16]  // dst
    mov        ecx, [esp + 8 + 20]  // width
    // Index all pointers off the alpha pointer (esi).
    sub        eax, esi
    sub        edx, esi
    sub        edi, esi

    // 32 pixel loop.
  convertloop32:
    vmovdqu    ymm0, [esi]            // alpha
    vpunpckhbw ymm3, ymm0, ymm0       // 8..15, 24..31
    vpunpcklbw ymm0, ymm0, ymm0       // 0..7, 16..23
    vpxor      ymm3, ymm3, ymm5       // a, 255-a
    vpxor      ymm0, ymm0, ymm5       // a, 255-a
    vmovdqu    ymm1, [eax + esi]      // src0
    vmovdqu    ymm2, [edx + esi]      // src1
    vpunpckhbw ymm4, ymm1, ymm2
    vpunpcklbw ymm1, ymm1, ymm2
    vpsubb     ymm4, ymm4, ymm6       // bias src0/1 - 128
    vpsubb     ymm1, ymm1, ymm6       // bias src0/1 - 128
    vpmaddubsw ymm3, ymm3, ymm4
    vpmaddubsw ymm0, ymm0, ymm1
    vpaddw     ymm3, ymm3, ymm7       // unbias result - 32768 and round.
    vpaddw     ymm0, ymm0, ymm7       // unbias result - 32768 and round.
    vpsrlw     ymm3, ymm3, 8
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm3
    vmovdqu    [edi + esi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         convertloop32

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 8 pixels at a time.
// Alpha-blend src_argb0 over src_argb1 into dst_argb, 4 pixels at a time
// with a 1 pixel tail loop.
__declspec(naked)
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24
    sub        ecx, 4
    jl         convertloop4b    // less than 4 pixels?

    // 4 pixel loop.
  convertloop4:
    movdqu     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqu     xmm2, [esi]      // _r_b
    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqu     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

  convertloop4b:
    add        ecx, 4 - 1
    jl         convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd       xmm3, [eax]      // src argb
    lea        eax, [eax + 4]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movd       xmm2, [esi]      // _r_b
    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movd       xmm1, [esi]      // _a_g
    lea        esi, [esi + 4]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

  convertloop1b:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Multiply RGB by alpha (premultiply), 4 pixels at a time; alpha unchanged.
__declspec(naked)
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
    pslld      xmm3, 24
    movdqa     xmm4, xmmword ptr kShuffleAlpha0
    movdqa     xmm5, xmmword ptr kShuffleAlpha1

  convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    pshufb     xmm0, xmm4       // isolate first 2 alphas
    movdqu     xmm1, [eax]      // read 4 pixels
    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
    pmulhuw    xmm0, xmm1       // rgb * a
    movdqu     xmm1, [eax]      // read 4 pixels
    pshufb     xmm1, xmm5       // isolate next 2 alphas
    movdqu     xmm2, [eax]      // read 4 pixels
    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
    pmulhuw    xmm1, xmm2       // rgb * a
    movdqu     xmm2, [eax]      // mask original alpha
    lea        eax, [eax + 16]
    pand       xmm2, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    por        xmm0, xmm2       // copy original alpha
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
// Multiply RGB by alpha (premultiply), 8 pixels at a time; alpha unchanged.
__declspec(naked)
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src offset
    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24

  convertloop:
    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
    vpand      ymm6, ymm6, ymm5  // isolate alpha
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vpor       ymm0, ymm0, ymm6  // copy original alpha
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Divide RGB by alpha using the fixed_invtbl8 reciprocal table,
// 4 pixels at a time.
__declspec(naked)
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // src_argb
    mov        edx, [esp + 12 + 8]   // dst_argb
    mov        ecx, [esp + 12 + 12]  // width
    lea        ebx, fixed_invtbl8    // reciprocal alpha lookup table

  convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    movzx      esi, byte ptr [eax + 3]  // first alpha
    movzx      edi, byte ptr [eax + 7]  // second alpha
    punpcklbw  xmm0, xmm0       // first 2
    movd       xmm2, dword ptr [ebx + esi * 4]
    movd       xmm3, dword ptr [ebx + edi * 4]
    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm0, xmm2       // rgb * a

    movdqu     xmm1, [eax]      // read 4 pixels
    movzx      esi, byte ptr [eax + 11]  // third alpha
    movzx      edi, byte ptr [eax + 15]  // forth alpha
    punpckhbw  xmm1, xmm1       // next 2
    movd       xmm2, dword ptr [ebx + esi * 4]
    movd       xmm3, dword ptr [ebx + edi * 4]
    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps    xmm2, xmm3
    pmulhuw    xmm1, xmm2       // rgb * a
    lea        eax, [eax + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
// Divide RGB by alpha, 8 pixels at a time, using vpgatherdd for the
// reciprocal table lookup.
__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src offset
    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    vmovdqu    ymm6, [eax]        // read 8 pixels.
    vpcmpeqb   ymm5, ymm5, ymm5   // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24     // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6   // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6   // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
    vpunpcklwd ymm2, ymm3, ymm3   // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3   // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4   // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm4   // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2   // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3   // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1   // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    vzeroupper
    ret
  }
}
#else  // USE_GATHER
// Divide RGB by alpha, 8 pixels at a time, with scalar table lookups
// replacing the (slow) vpgatherdd instruction.
__declspec(naked)
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {

    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // src_argb
    mov        edx, [esp + 12 + 8]   // dst_argb
    mov        ecx, [esp + 12 + 12]  // width
    sub        edx, eax              // edx = dst - src offset
    lea        ebx, fixed_invtbl8    // reciprocal alpha lookup table
    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    // replace VPGATHER
    movzx      esi, byte ptr [eax + 3]                 // alpha0
    movzx      edi, byte ptr [eax + 7]                 // alpha1
    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
    movzx      esi, byte ptr [eax + 11]                // alpha2
    movzx      edi, byte ptr [eax + 15]                // alpha3
    vpunpckldq xmm6, xmm0, xmm1                 // [1,a1,1,a0]
    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
    movzx      esi, byte ptr [eax + 19]                // alpha4
    movzx      edi, byte ptr [eax + 23]                // alpha5
    vpunpckldq xmm7, xmm2, xmm3                 // [1,a3,1,a2]
    vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
    vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
    movzx      esi, byte ptr [eax + 27]                // alpha6
    movzx      edi, byte ptr [eax + 31]                // alpha7
    vpunpckldq xmm0, xmm0, xmm1                 // [1,a5,1,a4]
    vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
    vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3                 // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7                // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2                // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1  // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu    ymm6, [eax]        // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6   // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6   // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3   // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3   // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm5   // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm5   // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2   // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3   // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1   // unmutated.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // USE_GATHER
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked)
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

  convertloop:
    movdqu     xmm0, [eax]      // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5       // Add .5 for rounding.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0       // 8 G bytes
    movdqu     xmm2, [eax]      // A
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm2, 24
    psrld      xmm3, 24
    packuswb   xmm2, xmm3
    packuswb   xmm2, xmm2       // 8 A bytes
    movdqa     xmm3, xmm0       // Weave into GG, GA, then GGGA
    punpcklbw  xmm0, xmm0       // 8 GG words
    punpcklbw  xmm3, xmm2       // 8 GA words
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm3       // GGGA first 4
    punpckhwd  xmm1, xmm3       // GGGA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// Each output channel is a weighted sum of the source B/G/R bytes using the
// kARGBToSepia* tables above (7-bit fixed point: pmaddubsw pairs, phaddw
// completes the 3-term dot product, psrlw 7 rescales).  Alpha is extracted
// from the source (psrld 24) and re-interleaved unchanged.
__declspec(naked)
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
    mov        ecx, [esp + 8]   /* width */
    movdqa     xmm2, xmmword ptr kARGBToSepiaB
    movdqa     xmm3, xmmword ptr kARGBToSepiaG
    movdqa     xmm4, xmmword ptr kARGBToSepiaR

 convertloop:
    movdqu     xmm0, [eax]      // B
    movdqu     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0       // 8 B values
    movdqu     xmm5, [eax]      // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5       // 8 G values
    punpcklbw  xmm0, xmm5       // 8 BG values
    movdqu     xmm5, [eax]      // R
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5       // 8 R values
    movdqu     xmm6, [eax]      // A
    movdqu     xmm1, [eax + 16]
    psrld      xmm6, 24
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6       // 8 A values
    punpcklbw  xmm5, xmm6       // 8 RA values
    movdqa     xmm1, xmm0       // Weave BG, RA together
    punpcklwd  xmm0, xmm5       // BGRA first 4
    punpckhwd  xmm1, xmm5       // BGRA next 4
    movdqu     [eax], xmm0
    movdqu     [eax + 16], xmm1
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
// Transform 8 ARGB pixels (32 bytes) with a caller supplied color matrix.
// matrix_argb is 16 signed bytes; each pshufd below broadcasts one 4-byte
// row of the matrix to every lane.  Coefficients are 6-bit fixed point:
// pmaddubsw + phaddsw form the signed dot products, psraw 6 rescales.
__declspec(naked)
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* matrix_argb */
    movdqu     xmm5, [ecx]
    pshufd     xmm2, xmm5, 0x00  // matrix row for B
    pshufd     xmm3, xmm5, 0x55  // matrix row for G
    pshufd     xmm4, xmm5, 0xaa  // matrix row for R
    pshufd     xmm5, xmm5, 0xff  // matrix row for A
    mov        ecx, [esp + 16]  /* width */

 convertloop:
    movdqu     xmm0, [eax]      // B
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
    movdqu     xmm6, [eax]      // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7       // B
    phaddsw    xmm6, xmm1       // G
    psraw      xmm0, 6          // B
    psraw      xmm6, 6          // G
    packuswb   xmm0, xmm0       // 8 B values
    packuswb   xmm6, xmm6       // 8 G values
    punpcklbw  xmm0, xmm6       // 8 BG values
    movdqu     xmm1, [eax]      // R
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7       // R
    movdqu     xmm6, [eax]      // A
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7       // A
    psraw      xmm1, 6          // R
    psraw      xmm6, 6          // A
    packuswb   xmm1, xmm1       // 8 R values
    packuswb   xmm6, xmm6       // 8 A values
    punpcklbw  xmm1, xmm6       // 8 RA values
    movdqa     xmm6, xmm0       // Weave BG, RA together
    punpcklwd  xmm0, xmm1       // BGRA first 4
    punpckhwd  xmm6, xmm1       // BGRA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Quantize 4 ARGB pixels (16 bytes) in place:
//   channel = (channel * scale >> 16) * interval_size + interval_offset
// scale / interval_size / interval_offset are broadcast to all 8 word lanes
// via pshuflw/pshufd.  Alpha is preserved through the 0xff000000 mask (xmm6).
__declspec(naked)
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov        eax, [esp + 4]   /* dst_argb */
    movd       xmm2, [esp + 8]  /* scale */
    movd       xmm3, [esp + 12] /* interval_size */
    movd       xmm4, [esp + 16] /* interval_offset */
    mov        ecx, [esp + 20]  /* width */
    pshuflw    xmm2, xmm2, 040h
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5       // constant 0
    pcmpeqb    xmm6, xmm6       // generate mask 0xff000000
    pslld      xmm6, 24

 convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    punpcklbw  xmm0, xmm5       // first 2 pixels
    pmulhuw    xmm0, xmm2       // pixel * scale >> 16
    movdqu     xmm1, [eax]      // read 4 pixels
    punpckhbw  xmm1, xmm5       // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3       // * interval_size
    movdqu     xmm7, [eax]      // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6       // mask alpha
    paddw      xmm0, xmm4       // + interval_offset
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7       // restore original alpha bytes
    movdqu     [eax], xmm0
    lea        eax, [eax + 16]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Shade 4 ARGB pixels at a time by a 32-bit ARGB 'value': the value's bytes
// are widened to words (punpcklbw value,value) and each source channel is
// scaled via pmulhuw then >> 8, i.e. roughly (channel * value_channel) >> 8.
__declspec(naked)
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16] // value
    punpcklbw  xmm2, xmm2       // widen value bytes to words
    punpcklqdq xmm2, xmm2       // broadcast to both pixels of each half

 convertloop:
    movdqu     xmm0, [eax]      // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0       // first 2
    punpckhbw  xmm1, xmm1       // next 2
    pmulhuw    xmm0, xmm2       // argb * value
    pmulhuw    xmm1, xmm2       // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// src0 bytes are duplicated to words, src1 bytes are zero extended, so
// pmulhuw yields approximately (a * b) >> 8 per channel.
__declspec(naked)
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pxor       xmm5, xmm5           // constant 0

 convertloop:
    movdqu     xmm0, [eax]          // read 4 pixels from src_argb0
    movdqu     xmm2, [esi]          // read 4 pixels from src_argb1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm0, xmm0           // first 2
    punpckhbw  xmm1, xmm1           // next 2
    punpcklbw  xmm2, xmm5           // first 2
    punpckhbw  xmm3, xmm5           // next 2
    pmulhuw    xmm0, xmm2           // src_argb0 * src_argb1 first 2
    pmulhuw    xmm1, xmm3           // src_argb0 * src_argb1 next 2
    lea        eax, [eax + 16]
    lea        esi, [esi + 16]
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together (saturating), 4 pixels at a time,
// with a 1-pixel tail loop for widths that are not a multiple of 4.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked)
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

    sub        ecx, 4
    jl         convertloop49

    // 4 pixel loop.
 convertloop4:
    movdqu     xmm0, [eax]          // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]          // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    paddusb    xmm0, xmm1           // src_argb0 + src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        convertloop4

 convertloop49:
    add        ecx, 4 - 1           // restore count; loop while >= 0
    jl         convertloop19

    // 1 pixel loop for the remaining 1..3 pixels.
 convertloop1:
    movd       xmm0, [eax]          // read 1 pixel from src_argb0
    lea        eax, [eax + 4]
    movd       xmm1, [esi]          // read 1 pixel from src_argb1
    lea        esi, [esi + 4]
    paddusb    xmm0, xmm1           // src_argb0 + src_argb1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        convertloop1

 convertloop19:
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
// Subtract 2 rows of ARGB pixels (saturating: psubusb), 4 pixels at a time.
__declspec(naked)
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    movdqu     xmm0, [eax]          // read 4 pixels from src_argb0
    lea        eax, [eax + 16]
    movdqu     xmm1, [esi]          // read 4 pixels from src_argb1
    lea        esi, [esi + 16]
    psubusb    xmm0, xmm1           // src_argb0 - src_argb1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Same math as the SSE2 version: src0 duplicated, src1 zero extended,
// vpmulhuw gives approximately (a * b) >> 8 per channel.
__declspec(naked)
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    vpxor      ymm5, ymm5, ymm5     // constant 0

 convertloop:
    vmovdqu    ymm1, [eax]          // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vmovdqu    ymm3, [esi]          // read 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1     // low 4
    vpunpckhbw ymm1, ymm1, ymm1     // high 4
    vpunpcklbw ymm2, ymm3, ymm5     // low 4
    vpunpckhbw ymm3, ymm3, ymm5     // high 4
    vpmulhuw   ymm0, ymm0, ymm2     // src_argb0 * src_argb1 low 4
    vpmulhuw   ymm1, ymm1, ymm3     // src_argb0 * src_argb1 high 4
    vpackuswb  ymm0, ymm0, ymm1
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Add 2 rows of ARGB pixels together (saturating: vpaddusb), 8 pixels at a
// time.  Width is assumed to be handled in multiples of 8 by the caller.
__declspec(naked)
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]          // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpaddusb   ymm0, ymm0, [esi]    // add 8 pixels from src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together (saturating), 8 pixels at a time.
__declspec(naked)
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]          // read 8 pixels from src_argb0
    lea        eax, [eax + 32]
    vpsubusb   ymm0, ymm0, [esi]    // src_argb0 - src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// Produces 8 bytes of |gradient| per iteration.  src_y1/src_y2/dst are
// converted to offsets from src_y0 so one index register (eax) walks all
// four pointers.
__declspec(naked)
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y0
    mov        esi, [esp + 8 + 8]   // src_y1
    mov        edi, [esp + 8 + 12]  // src_y2
    mov        edx, [esp + 8 + 16]  // dst_sobelx
    mov        ecx, [esp + 8 + 20]  // width
    sub        esi, eax             // src_y1 as offset from src_y0
    sub        edi, eax             // src_y2 as offset from src_y0
    sub        edx, eax             // dst as offset from src_y0
    pxor       xmm5, xmm5           // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1   // middle row added twice = weight 2
    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Same structure as SobelXRow but differences are taken vertically
// (src_y0 - src_y1) at column offsets 0, 1, 2.
__declspec(naked)
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_y0
    mov        esi, [esp + 4 + 8]   // src_y1
    mov        edx, [esp + 4 + 12]  // dst_sobely
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_y1 as offset from src_y0
    sub        edx, eax             // dst as offset from src_y0
    pxor       xmm5, xmm5           // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1
    paddw      xmm0, xmm1   // middle column added twice = weight 2
    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// 16 gray bytes in, 16 GGGA pixels (64 bytes) out per iteration.
__declspec(naked)
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_sobely as offset from src_sobelx
    pcmpeqb    xmm5, xmm5           // alpha 255
    pslld      xmm5, 24             // 0xff000000

 convertloop:
    movdqu     xmm0, [eax]          // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]    // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1           // sobel = sobelx + sobely
    movdqa     xmm2, xmm0           // GG
    punpcklbw  xmm2, xmm0           // First 8
    punpckhbw  xmm0, xmm0           // Next 8
    movdqa     xmm1, xmm2           // GGGG
    punpcklwd  xmm1, xmm2           // First 4
    punpckhwd  xmm2, xmm2           // Next 4
    por        xmm1, xmm5           // GGGA
    por        xmm2, xmm5
    movdqa     xmm3, xmm0           // GGGG
    punpcklwd  xmm3, xmm0           // Next 4
    punpckhwd  xmm0, xmm0           // Last 4
    por        xmm3, xmm5           // GGGA
    por        xmm0, xmm5
    movdqu     [edx], xmm1
    movdqu     [edx + 16], xmm2
    movdqu     [edx + 32], xmm3
    movdqu     [edx + 48], xmm0
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELROW_SSE2

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane (gray bytes).
__declspec(naked)
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_y
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_sobely as offset from src_sobelx

 convertloop:
    movdqu     xmm0, [eax]          // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]    // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    paddusb    xmm0, xmm1           // sobel = sobelx + sobely
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked)
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_sobelx
    mov        esi, [esp + 4 + 8]   // src_sobely
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax             // src_sobely as offset from src_sobelx
    pcmpeqb    xmm5, xmm5           // alpha 255

 convertloop:
    movdqu     xmm0, [eax]          // read 16 pixels src_sobelx
    movdqu     xmm1, [eax + esi]    // read 16 pixels src_sobely
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    paddusb    xmm2, xmm1           // sobel = sobelx + sobely
    movdqa     xmm3, xmm0           // XA
    punpcklbw  xmm3, xmm5
    punpckhbw  xmm0, xmm5
    movdqa     xmm4, xmm1           // YS
    punpcklbw  xmm4, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     xmm6, xmm4           // YSXA
    punpcklwd  xmm6, xmm3           // First 4
    punpckhwd  xmm4, xmm3           // Next 4
    movdqa     xmm7, xmm1           // YSXA
    punpcklwd  xmm7, xmm0           // Next 4
    punpckhwd  xmm1, xmm0           // Last 4
    movdqu     [edx], xmm6
    movdqu     [edx + 16], xmm4
    movdqu     [edx + 32], xmm7
    movdqu     [edx + 48], xmm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXYROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
// Not naked: the compiler saves/restores esi/edi used by the __asm block.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov        eax, topleft    // eax topleft
    mov        esi, botleft    // esi botleft
    mov        edx, width
    movd       xmm5, area
    mov        edi, dst
    mov        ecx, count
    cvtdq2ps   xmm5, xmm5
    rcpss      xmm4, xmm5      // 1.0f / area
    pshufd     xmm4, xmm4, 0
    sub        ecx, 4
    jl         l4b

    cmp        area, 128       // 128 pixels will not overflow 15 bits.
    ja         l4

    // Small-area fast path: precompute a rounded 0.16 fixed point
    // reciprocal so the sums can be scaled with pmulhuw (integer only).
    pshufd     xmm5, xmm5, 0   // area
    pcmpeqb    xmm6, xmm6      // constant of 65536.0 - 1 = 65535.0
    psrld      xmm6, 16
    cvtdq2ps   xmm6, xmm6
    addps      xmm5, xmm6      // (65536.0 + area - 1)
    mulps      xmm5, xmm4      // (65536.0 + area - 1) * 1 / area
    cvtps2dq   xmm5, xmm5      // 0.16 fixed point
    packssdw   xmm5, xmm5      // 16 bit shorts

    // 4 pixel loop small blocks.
 s4:
    // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
    packssdw   xmm2, xmm3

    pmulhuw    xmm0, xmm5  // sum * (1/area) in 0.16 fixed point
    pmulhuw    xmm2, xmm5

    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        s4

    jmp        l4b

    // 4 pixel loop (general path: float multiply by 1/area).
 l4:
    // top left
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]

    // - top right
    psubd      xmm0, [eax + edx * 4]
    psubd      xmm1, [eax + edx * 4 + 16]
    psubd      xmm2, [eax + edx * 4 + 32]
    psubd      xmm3, [eax + edx * 4 + 48]
    lea        eax, [eax + 64]

    // - bottom left
    psubd      xmm0, [esi]
    psubd      xmm1, [esi + 16]
    psubd      xmm2, [esi + 32]
    psubd      xmm3, [esi + 48]

    // + bottom right
    paddd      xmm0, [esi + edx * 4]
    paddd      xmm1, [esi + edx * 4 + 16]
    paddd      xmm2, [esi + edx * 4 + 32]
    paddd      xmm3, [esi + edx * 4 + 48]
    lea        esi, [esi + 64]

    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
    cvtdq2ps   xmm1, xmm1
    mulps      xmm0, xmm4
    mulps      xmm1, xmm4
    cvtdq2ps   xmm2, xmm2
    cvtdq2ps   xmm3, xmm3
    mulps      xmm2, xmm4
    mulps      xmm3, xmm4
    cvtps2dq   xmm0, xmm0
    cvtps2dq   xmm1, xmm1
    cvtps2dq   xmm2, xmm2
    cvtps2dq   xmm3, xmm3
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    packuswb   xmm0, xmm2
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4
    jge        l4

 l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
 l1:
    movdqu     xmm0, [eax]
    psubd      xmm0, [eax + edx * 4]
    lea        eax, [eax + 16]
    psubd      xmm0, [esi]
    paddd      xmm0, [esi + edx * 4]
    lea        esi, [esi + 16]
    cvtdq2ps   xmm0, xmm0
    mulps      xmm0, xmm4
    cvtps2dq   xmm0, xmm0
    packssdw   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 1
    jge        l1
 l1b:
  }
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
// xmm0 carries the running BGRA sum for this row across loop iterations;
// each output adds that running sum to the previous row's cumulative sum.
// Not naked: the compiler saves/restores esi used by the __asm block.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov        eax, row
    mov        edx, cumsum
    mov        esi, previous_cumsum
    mov        ecx, width
    pxor       xmm0, xmm0  // running row sum
    pxor       xmm1, xmm1  // constant 0 for unpacking

    sub        ecx, 4
    jl         l4b
    test       edx, 15     // 4 pixel loop taken only if output 16-byte aligned
    jne        l4b

    // 4 pixel loop
 l4:
    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea        eax, [eax + 16]
    movdqa     xmm4, xmm2

    punpcklbw  xmm2, xmm1   // widen pixels 0-1 to words
    movdqa     xmm3, xmm2
    punpcklwd  xmm2, xmm1   // pixel 0 as dwords
    punpckhwd  xmm3, xmm1   // pixel 1 as dwords

    punpckhbw  xmm4, xmm1   // widen pixels 2-3 to words
    movdqa     xmm5, xmm4
    punpcklwd  xmm4, xmm1   // pixel 2 as dwords
    punpckhwd  xmm5, xmm1   // pixel 3 as dwords

    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]  // previous row above.
    paddd      xmm2, xmm0

    paddd      xmm0, xmm3
    movdqu     xmm3, [esi + 16]
    paddd      xmm3, xmm0

    paddd      xmm0, xmm4
    movdqu     xmm4, [esi + 32]
    paddd      xmm4, xmm0

    paddd      xmm0, xmm5
    movdqu     xmm5, [esi + 48]
    lea        esi, [esi + 64]
    paddd      xmm5, xmm0

    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm3
    movdqu     [edx + 32], xmm4
    movdqu     [edx + 48], xmm5

    lea        edx, [edx + 64]
    sub        ecx, 4
    jge        l4

 l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
 l1:
    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea        eax, [eax + 4]
    punpcklbw  xmm2, xmm1
    punpcklwd  xmm2, xmm1
    paddd      xmm0, xmm2
    movdqu     xmm2, [esi]
    lea        esi, [esi + 16]
    paddd      xmm2, xmm0
    movdqu     [edx], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 1
    jge        l1

 l1b:
  }
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
// Copy ARGB pixels from a source image along an affine path (uv advanced by
// dudv per pixel) to a row of destination.  esi is packed as
// (stride << 16) | 4 so that pmaddwd computes byte offsets x * 4 + y * stride
// directly from the truncated (x, y) shorts.
__declspec(naked)
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 12]  // src_argb
    mov        esi, [esp + 16]  // stride
    mov        edx, [esp + 20]  // dst_argb
    mov        ecx, [esp + 24]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]      // uv
    movq       xmm7, qword ptr [ecx + 8]  // dudv
    mov        ecx, [esp + 28]  // width
    shl        esi, 16          // 4, stride
    add        esi, 4
    movd       xmm5, esi
    sub        ecx, 4
    jl         l4b

    // setup for 4 pixel loop
    pshufd     xmm7, xmm7, 0x44  // dup dudv
    pshufd     xmm5, xmm5, 0     // dup 4, stride
    movdqa     xmm0, xmm2        // x0, y0, x1, y1
    addps      xmm0, xmm7
    movlhps    xmm2, xmm0
    movdqa     xmm4, xmm7
    addps      xmm4, xmm4        // dudv *= 2
    movdqa     xmm3, xmm2        // x2, y2, x3, y3
    addps      xmm3, xmm4
    addps      xmm4, xmm4        // dudv *= 4

    // 4 pixel loop
 l4:
    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
    packssdw   xmm0, xmm1    // x, y as 8 shorts
    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       xmm1, [eax + esi]  // read pixel 0
    movd       xmm6, [eax + edi]  // read pixel 1
    punpckldq  xmm1, xmm6         // combine pixel 0 and 1
    addps      xmm2, xmm4         // x, y += dx, dy first 2
    movq       qword ptr [edx], xmm1
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // shift right
    movd       edi, xmm0
    movd       xmm6, [eax + esi]  // read pixel 2
    movd       xmm0, [eax + edi]  // read pixel 3
    punpckldq  xmm6, xmm0         // combine pixel 2 and 3
    addps      xmm3, xmm4         // x, y += dx, dy next 2
    movq       qword ptr 8[edx], xmm6
    lea        edx, [edx + 16]
    sub        ecx, 4
    jge        l4

 l4b:
    add        ecx, 4 - 1
    jl         l1b

    // 1 pixel loop
 l1:
    cvttps2dq  xmm0, xmm2    // x, y float to int
    packssdw   xmm0, xmm0    // x, y as shorts
    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
    addps      xmm2, xmm7    // x, y += dx, dy
    movd       esi, xmm0
    movd       xmm0, [eax + esi]  // copy a pixel
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    sub        ecx, 1
    jge        l1
 l1b:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1: blends two source rows by
// source_y_fraction/256.  Fractions 0 and 128 dispatch to copy and
// average fast paths.  Bytes are biased by 128 to signed so vpmaddubsw
// can weight them, then unbiased and rounded by adding 0x80 per word.
__declspec(naked)
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    sub        edi, esi  // dst as offset from src (not done for the rep movsb path)
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.

    vmovd      xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    vmovd      xmm5, eax  // low fraction 256..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vbroadcastss ymm5, xmm5

    mov        eax, 0x80808080  // 128b for bias and rounding.
    vmovd      xmm4, eax
    vbroadcastss ymm4, xmm4

 xloop:
    vmovdqu    ymm0, [esi]
    vmovdqu    ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2  // mutates
    vpunpcklbw ymm0, ymm0, ymm2
    vpsubb     ymm1, ymm1, ymm4  // bias to signed image
    vpsubb     ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm5, ymm1
    vpmaddubsw ymm0, ymm5, ymm0
    vpaddw     ymm1, ymm1, ymm4  // unbias and round
    vpaddw     ymm0, ymm0, ymm4
    vpsrlw     ymm1, ymm1, 8
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm1  // unmutates
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         xloop
    jmp        xloop99

    // Blend 50 / 50.
 xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
    vmovdqu    [esi + edi], ymm0
    lea        esi, [esi + 32]
    sub        ecx, 32
    jg         xloop50
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    rep movsb  // esi/edi still absolute here; ecx is the byte count

 xloop99:
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_AVX2

// Bilinear filter 16x2 -> 16x1.  Same scheme as the AVX2 version above.
// TODO(fbarchard): Consider allowing 256 using memcpy.
__declspec(naked)
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  __asm {
    push       esi
    push       edi

    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi             // dst as offset from src
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.

    movd       xmm0, eax  // high fraction 0..255
    neg        eax
    add        eax, 256
    movd       xmm5, eax  // low fraction 255..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x80808080  // 128 for biasing image to signed.
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0x00

 xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
    movdqu     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    psubb      xmm0, xmm4  // bias image by -128
    psubb      xmm1, xmm4
    movdqa     xmm2, xmm5
    movdqa     xmm3, xmm5
    pmaddubsw  xmm2, xmm0
    pmaddubsw  xmm3, xmm1
    paddw      xmm2, xmm4  // unbias and round
    paddw      xmm3, xmm4
    psrlw      xmm2, 8
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [esi + edi], xmm2
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99

    // Blend 50 / 50.
 xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
 xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100

 xloop99:
    pop        edi
    pop        esi
    ret
  }
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Shuffle the 4 channels of each ARGB pixel through a 16-byte pshufb mask
// loaded from |shuffler|.  Used for BGRAToARGB, ABGRToARGB, RGBAToARGB and
// ARGBToRGBA.  Processes 8 pixels (32 bytes) per iteration; no tail handling,
// so width is expected to be a multiple of 8 (caller-aligned).
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // shuffler
    movdqu     xmm5, [ecx]       // keep shuffle mask resident in xmm5
    mov        ecx, [esp + 16]   // width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop
    ret
  }
}

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// AVX2 version of ARGBShuffleRow: 16 pixels (64 bytes) per iteration.
// vpshufb shuffles within each 128-bit lane, so the same 16-byte mask is
// broadcast to both lanes.  Width is expected to be a multiple of 16.
__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // shuffler
    vbroadcastf128 ymm5, [ecx]   // same shuffle in high as low.
    mov        ecx, [esp + 16]   // width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         wloop

    vzeroupper                   // avoid AVX->SSE transition penalty
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

// SSE2 (no pshufb) version of ARGBShuffleRow.  The first dword of the
// shuffler is tested against the four common channel orders and dispatched
// to a vectorized loop using pshufhw/pshuflw on 16-bit-expanded pixels
// (4 pixels/iteration); any other mask falls back to a scalar 1 pixel/
// iteration table walk (shuf_any1).
__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]    // src_argb
    mov        edx, [esp + 8 + 8]    // dst_argb
    mov        esi, [esp + 8 + 12]   // shuffler
    mov        ecx, [esp + 8 + 16]   // width
    pxor       xmm5, xmm5            // zero, for byte->word unpack

    mov        ebx, [esi]            // first 4 bytes of shuffler
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

    // Generic fallback: per-byte gather through the 4-entry shuffle table.
    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0123
    jmp        shuf99

  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0321
    jmp        shuf99

  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_2103
    jmp        shuf99

  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_3012
    // falls through to shuf99 when done

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1....U2Y2V2Y3....U4Y4V4Y5....

// Pack planar I422 (Y plus half-width U/V) into interleaved YUY2.
// Processes 16 Y pixels (8 UV pairs -> 32 output bytes) per iteration.
__declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi              // edx = src_v - src_u, so one pointer walks both

  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                   // UV interleaved
    movdqu     xmm0, [eax]                  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2                   // YUYV low 8 pixels
    punpckhbw  xmm1, xmm2                   // YUYV high 8 pixels
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// Pack planar I422 into interleaved UYVY (U/V lead each pair of Y samples).
// Same structure as I422ToYUY2Row_SSE2 but the unpack order is swapped so
// UV bytes come first.
__declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_y
    mov        esi, [esp + 8 + 8]    // src_u
    mov        edx, [esp + 8 + 12]   // src_v
    mov        edi, [esp + 8 + 16]   // dst_frame
    mov        ecx, [esp + 8 + 20]   // width
    sub        edx, esi              // edx = src_v - src_u offset

  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                   // UV interleaved
    movdqu     xmm0, [eax]                  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0                   // UYVY low 8 pixels
    punpckhbw  xmm2, xmm0                   // UYVY high 8 pixels
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Apply a cubic polynomial to every channel of every pixel:
//   out = clamp(C0 + C1*x + C2*x^2 + C3*x^3), with |poly| laid out as four
// 4-float vectors C0..C3 (one coefficient per channel).  Processes 2 pixels
// per iteration; width is expected to be a multiple of 2.
__declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        edx, [esp + 4 + 8]    // dst_argb
    mov        esi, [esp + 4 + 12]   // poly
    mov        ecx, [esp + 4 + 16]   // width
    pxor       xmm3, xmm3            // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
    // pmovzxbd xmm0, dword ptr [eax]      // BGRA pixel (SSE4.1 alternative)
    // pmovzxbd xmm4, dword ptr [eax + 4]  // BGRA pixel (SSE4.1 alternative)
    movq       xmm0, qword ptr [eax]   // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3              // pixel 0
    punpckhwd  xmm4, xmm3              // pixel 1
    cvtdq2ps   xmm0, xmm0              // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0              // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]        // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]             // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1              // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2              // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]        // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]        // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2              // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1              // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4              // pack with saturation clamps to 0..255
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2+FMA version of ARGBPolynomialRow: evaluates the cubic with fused
// multiply-adds on 2 pixels (8 channels) per iteration.
__declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // poly
    vbroadcastf128 ymm4, [ecx]        // C0
    vbroadcastf128 ymm5, [ecx + 16]   // C1
    vbroadcastf128 ymm6, [ecx + 32]   // C2
    vbroadcastf128 ymm7, [ecx + 48]   // C3
    mov        ecx, [esp + 16]   // width

    // 2 pixel loop.
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]   // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0              // X 8 floats
    vmulps      ymm2, ymm0, ymm0        // X * X
    vmulps      ymm3, ymm0, ymm7        // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5        // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6        // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3        // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0        // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8        // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0        // bgrabgra_00000000_00000000_00000000
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    sub         ecx, 2
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels in place with a 256-entry, 4-bytes-per-entry color
// table: each of B, G, R, A is replaced by table_argb[value * 4 + channel].
// Scalar x86 code, 1 pixel per iteration.
__declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // dst_argb (read and written in place)
    mov        esi, [esp + 4 + 8]    // table_argb
    mov        ecx, [esp + 4 + 12]   // width

    // 1 pixel loop.  eax is advanced first, then the pixel is addressed
    // as [eax - 4 + channel].
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Same as ARGBColorTableRow_X86 but only B, G and R are remapped; the
// alpha byte is left untouched.
__declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // dst_argb (read and written in place)
    mov        esi, [esp + 4 + 8]    // table_argb
    mov        ecx, [esp + 4 + 12]   // width

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
// Remap B, G and R of each ARGB pixel through a luma-dependent lookup table.
// A weighted luma is computed per pixel (pmaddubsw/phaddw with the 4 packed
// byte weights in |lumacoeff|); its high byte (luma & 0xff00, via the pand
// mask) selects a 256-byte row of |luma|, and each color byte indexes into
// that row.  Alpha is copied unchanged.  Processes 4 pixels per iteration;
// no tail handling, so width is expected to be a multiple of 4.
__declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]         // src_argb
    mov        edi, [esp + 8 + 8]         // dst_argb
    mov        ecx, [esp + 8 + 12]        // width
    movd       xmm2, dword ptr [esp + 8 + 16]   // luma table base pointer
    movd       xmm3, dword ptr [esp + 8 + 20]   // lumacoeff (4 packed byte weights)
    pshufd     xmm2, xmm2, 0              // broadcast table base to 4 dwords
    pshufd     xmm3, xmm3, 0              // broadcast weights to all pixels
    pcmpeqb    xmm4, xmm4                 // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5                 // zero, for word->dword unpack

    // 4 pixel loop: vector part computes 4 row pointers, scalar part does
    // the per-byte lookups.
  convertloop:
    movdqu     xmm0, xmmword ptr [eax]    // generate luma ptr
    pmaddubsw  xmm0, xmm0, xmm3 is invalid; keep 2-operand form below
    pmaddubsw  xmm0, xmm3                 // weighted byte pairs per pixel
    phaddw     xmm0, xmm0                 // sum pairs -> 16-bit luma per pixel
    pand       xmm0, xmm4                 // mask out low bits (row offset = luma & 0xff00)
    punpcklwd  xmm0, xmm5                 // widen offsets to dwords
    paddd      xmm0, xmm2                 // add table base -> 4 row pointers
    movd       esi, xmm0                  // esi = row pointer for pixel 0
    pshufd     xmm0, xmm0, 0x39           // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]    // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0                  // esi = row pointer for pixel 1
    pshufd     xmm0, xmm0, 0x39           // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]    // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0                  // esi = row pointer for pixel 2
    pshufd     xmm0, xmm0, 0x39           // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]   // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0                  // esi = row pointer for pixel 3

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]   // copy alpha.
    mov        byte ptr [edi + 15], dl

    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

// NOTE(review): this closes the 32-bit inline-asm section (the code above
// uses esp-relative args and __declspec(naked), which MSVC supports only
// for x86); the previous comment incorrectly said _M_X64.
#endif  // defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))