1/* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#include "libyuv/basic_types.h" 14 15#ifdef __cplusplus 16namespace libyuv { 17extern "C" { 18#endif 19 20// This module is for GCC x86 and x64 21#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) 22 23// GCC 4.2 on OSX has link error when passing static or const to inline. 24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. 25#ifdef __APPLE__ 26#define CONST 27#else 28#define CONST static const 29#endif 30 31#ifdef HAS_ARGBTOYROW_SSSE3 32 33// Constants for ARGB 34CONST vec8 kARGBToY = { 35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 36}; 37 38CONST vec8 kARGBToU = { 39 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 40}; 41 42CONST vec8 kARGBToV = { 43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 44}; 45 46// Constants for BGRA 47CONST vec8 kBGRAToY = { 48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 49}; 50 51CONST vec8 kBGRAToU = { 52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 53}; 54 55CONST vec8 kBGRAToV = { 56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 57}; 58 59// Constants for ABGR 60CONST vec8 kABGRToY = { 61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 62}; 63 64CONST vec8 kABGRToU = { 65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 66}; 67 68CONST vec8 kABGRToV = { 69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 70}; 71 72CONST uvec8 kAddY16 = { 73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u 74}; 75 76CONST uvec8 kAddUV128 = { 77 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u 79}; 80 81// Shuffle table for converting RGB24 to ARGB. 82CONST uvec8 kShuffleMaskRGB24ToARGB = { 83 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 84}; 85 86// Shuffle table for converting RAW to ARGB. 87CONST uvec8 kShuffleMaskRAWToARGB = { 88 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 89}; 90 91// Shuffle table for converting ABGR to ARGB. 92CONST uvec8 kShuffleMaskABGRToARGB = { 93 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u 94}; 95 96// Shuffle table for converting BGRA to ARGB. 97CONST uvec8 kShuffleMaskBGRAToARGB = { 98 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u 99}; 100 101// Shuffle table for converting RGBA to ARGB. 102CONST uvec8 kShuffleMaskRGBAToARGB = { 103 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u 104}; 105 106// Shuffle table for converting ARGB to RGBA. 107CONST uvec8 kShuffleMaskARGBToRGBA = { 108 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u 109}; 110 111// Shuffle table for converting ARGB to RGB24. 112CONST uvec8 kShuffleMaskARGBToRGB24 = { 113 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u 114}; 115 116// Shuffle table for converting ARGB to RAW. 117CONST uvec8 kShuffleMaskARGBToRAW = { 118 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 119}; 120 121void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 122 asm volatile ( 123 "pcmpeqb %%xmm5,%%xmm5 \n" 124 "pslld $0x18,%%xmm5 \n" 125 ".p2align 4 \n" 126 "1: \n" 127 "movq (%0),%%xmm0 \n" 128 "lea 0x8(%0),%0 \n" 129 "punpcklbw %%xmm0,%%xmm0 \n" 130 "movdqa %%xmm0,%%xmm1 \n" 131 "punpcklwd %%xmm0,%%xmm0 \n" 132 "punpckhwd %%xmm1,%%xmm1 \n" 133 "por %%xmm5,%%xmm0 \n" 134 "por %%xmm5,%%xmm1 \n" 135 "movdqa %%xmm0,(%1) \n" 136 "movdqa %%xmm1,0x10(%1) \n" 137 "lea 0x20(%1),%1 \n" 138 "sub $0x8,%2 \n" 139 "jg 1b \n" 140 : "+r"(src_y), // %0 141 "+r"(dst_argb), // %1 142 "+r"(pix) // %2 143 : 144 : "memory", "cc" 145#if defined(__SSE2__) 146 , "xmm0", "xmm1", "xmm5" 147#endif 148 ); 149} 150 151void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { 152 asm volatile ( 153 "movdqa %3,%%xmm5 \n" 154 "sub %0,%1 \n" 155 ".p2align 4 \n" 156 "1: \n" 157 "movdqa (%0),%%xmm0 \n" 158 "pshufb %%xmm5,%%xmm0 \n" 159 "sub $0x4,%2 \n" 160 "movdqa %%xmm0,(%0,%1,1) \n" 161 "lea 0x10(%0),%0 \n" 162 "jg 1b \n" 163 164 : "+r"(src_abgr), // %0 165 "+r"(dst_argb), // %1 166 "+r"(pix) // %2 167 : "m"(kShuffleMaskABGRToARGB) // %3 168 : "memory", "cc" 169#if defined(__SSE2__) 170 , "xmm0", "xmm5" 171#endif 172 ); 173} 174 175void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { 176 asm volatile ( 177 "movdqa %3,%%xmm5 \n" 178 "sub %0,%1 \n" 179 ".p2align 4 \n" 180 "1: \n" 181 "movdqa (%0),%%xmm0 \n" 182 "pshufb %%xmm5,%%xmm0 \n" 183 "sub $0x4,%2 \n" 184 "movdqa %%xmm0,(%0,%1,1) \n" 185 "lea 0x10(%0),%0 \n" 186 "jg 1b \n" 187 : "+r"(src_bgra), // %0 188 "+r"(dst_argb), // %1 189 "+r"(pix) // %2 190 : "m"(kShuffleMaskBGRAToARGB) // %3 191 : "memory", "cc" 192#if defined(__SSE2__) 193 , "xmm0", "xmm5" 194#endif 195 ); 196} 197 198void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { 199 asm volatile ( 200 "movdqa %3,%%xmm5 \n" 201 "sub %0,%1 \n" 202 ".p2align 4 \n" 203 "1: \n" 204 "movdqa (%0),%%xmm0 \n" 205 "pshufb %%xmm5,%%xmm0 \n" 206 "sub $0x4,%2 \n" 207 "movdqa %%xmm0,(%0,%1,1) \n" 208 "lea 0x10(%0),%0 \n" 209 "jg 1b \n" 210 211 : "+r"(src_rgba), // %0 212 "+r"(dst_argb), // %1 213 "+r"(pix) // %2 214 : "m"(kShuffleMaskRGBAToARGB) // %3 215 : "memory", "cc" 216#if defined(__SSE2__) 217 , "xmm0", "xmm5" 218#endif 219 ); 220} 221 222void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { 223 asm volatile ( 224 "movdqa %3,%%xmm5 \n" 225 "sub %0,%1 \n" 226 ".p2align 4 \n" 227 "1: \n" 228 "movdqa (%0),%%xmm0 \n" 229 "pshufb %%xmm5,%%xmm0 \n" 230 "sub $0x4,%2 \n" 231 "movdqa %%xmm0,(%0,%1,1) \n" 232 "lea 0x10(%0),%0 \n" 233 "jg 1b \n" 234 235 : "+r"(src_argb), // %0 236 "+r"(dst_rgba), // %1 237 "+r"(pix) // %2 238 : "m"(kShuffleMaskARGBToRGBA) // %3 239 : "memory", "cc" 240#if defined(__SSE2__) 241 , "xmm0", "xmm5" 242#endif 243 ); 244} 245 246void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 247 asm volatile ( 248 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 249 "pslld $0x18,%%xmm5 \n" 250 "movdqa %3,%%xmm4 \n" 251 ".p2align 4 \n" 252 "1: \n" 253 "movdqu (%0),%%xmm0 \n" 254 "movdqu 0x10(%0),%%xmm1 \n" 255 "movdqu 0x20(%0),%%xmm3 \n" 256 "lea 0x30(%0),%0 \n" 257 "movdqa %%xmm3,%%xmm2 \n" 258 "palignr $0x8,%%xmm1,%%xmm2 \n" 259 "pshufb %%xmm4,%%xmm2 \n" 260 "por %%xmm5,%%xmm2 \n" 261 "palignr $0xc,%%xmm0,%%xmm1 \n" 262 "pshufb %%xmm4,%%xmm0 \n" 263 "movdqa %%xmm2,0x20(%1) \n" 264 "por %%xmm5,%%xmm0 \n" 265 "pshufb %%xmm4,%%xmm1 \n" 266 "movdqa %%xmm0,(%1) \n" 267 "por %%xmm5,%%xmm1 \n" 268 "palignr $0x4,%%xmm3,%%xmm3 \n" 269 "pshufb %%xmm4,%%xmm3 \n" 270 "movdqa %%xmm1,0x10(%1) \n" 271 "por %%xmm5,%%xmm3 \n" 272 "sub $0x10,%2 \n" 273 "movdqa %%xmm3,0x30(%1) \n" 274 "lea 0x40(%1),%1 \n" 275 "jg 1b \n" 276 : "+r"(src_rgb24), // %0 277 "+r"(dst_argb), // %1 278 "+r"(pix) // %2 279 : "m"(kShuffleMaskRGB24ToARGB) // %3 280 : "memory", "cc" 281#if defined(__SSE2__) 282 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 283#endif 284 ); 285} 286 287void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { 288 asm volatile ( 289 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 290 "pslld $0x18,%%xmm5 \n" 291 "movdqa %3,%%xmm4 \n" 292 ".p2align 4 \n" 293 "1: \n" 294 "movdqu (%0),%%xmm0 \n" 295 "movdqu 0x10(%0),%%xmm1 \n" 296 "movdqu 0x20(%0),%%xmm3 \n" 297 "lea 0x30(%0),%0 \n" 298 "movdqa %%xmm3,%%xmm2 \n" 299 "palignr $0x8,%%xmm1,%%xmm2 \n" 300 "pshufb %%xmm4,%%xmm2 \n" 301 "por %%xmm5,%%xmm2 \n" 302 "palignr $0xc,%%xmm0,%%xmm1 \n" 303 "pshufb %%xmm4,%%xmm0 \n" 304 "movdqa %%xmm2,0x20(%1) \n" 305 "por %%xmm5,%%xmm0 \n" 306 "pshufb %%xmm4,%%xmm1 \n" 307 "movdqa %%xmm0,(%1) \n" 308 "por %%xmm5,%%xmm1 \n" 309 "palignr $0x4,%%xmm3,%%xmm3 \n" 310 "pshufb %%xmm4,%%xmm3 \n" 311 "movdqa %%xmm1,0x10(%1) \n" 312 "por %%xmm5,%%xmm3 \n" 313 "sub $0x10,%2 \n" 314 "movdqa %%xmm3,0x30(%1) \n" 315 "lea 0x40(%1),%1 \n" 316 "jg 1b \n" 317 : "+r"(src_raw), // %0 318 "+r"(dst_argb), // %1 319 "+r"(pix) // %2 320 : "m"(kShuffleMaskRAWToARGB) // %3 321 : "memory", "cc" 322#if defined(__SSE2__) 323 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 324#endif 325 ); 326} 327 328void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 329 asm volatile ( 330 "mov $0x1080108,%%eax \n" 331 "movd %%eax,%%xmm5 \n" 332 "pshufd $0x0,%%xmm5,%%xmm5 \n" 333 "mov $0x20802080,%%eax \n" 334 "movd %%eax,%%xmm6 \n" 335 "pshufd $0x0,%%xmm6,%%xmm6 \n" 336 "pcmpeqb %%xmm3,%%xmm3 \n" 337 "psllw $0xb,%%xmm3 \n" 338 "pcmpeqb %%xmm4,%%xmm4 \n" 339 "psllw $0xa,%%xmm4 \n" 340 "psrlw $0x5,%%xmm4 \n" 341 "pcmpeqb %%xmm7,%%xmm7 \n" 342 "psllw $0x8,%%xmm7 \n" 343 "sub %0,%1 \n" 344 "sub %0,%1 \n" 345 ".p2align 4 \n" 346 "1: \n" 347 "movdqu (%0),%%xmm0 \n" 348 "movdqa %%xmm0,%%xmm1 \n" 349 "movdqa %%xmm0,%%xmm2 \n" 350 "pand %%xmm3,%%xmm1 \n" 351 "psllw $0xb,%%xmm2 \n" 352 "pmulhuw %%xmm5,%%xmm1 \n" 353 "pmulhuw %%xmm5,%%xmm2 \n" 354 "psllw $0x8,%%xmm1 \n" 355 "por %%xmm2,%%xmm1 \n" 356 "pand %%xmm4,%%xmm0 \n" 357 "pmulhuw %%xmm6,%%xmm0 \n" 358 "por %%xmm7,%%xmm0 \n" 359 "movdqa %%xmm1,%%xmm2 \n" 360 "punpcklbw %%xmm0,%%xmm1 \n" 361 "punpckhbw %%xmm0,%%xmm2 \n" 362 "movdqa %%xmm1,(%1,%0,2) \n" 363 "movdqa %%xmm2,0x10(%1,%0,2) \n" 364 "lea 0x10(%0),%0 \n" 365 "sub $0x8,%2 \n" 366 "jg 1b \n" 367 : "+r"(src), // %0 368 "+r"(dst), // %1 369 "+r"(pix) // %2 370 : 371 : "memory", "cc", "eax" 372#if defined(__SSE2__) 373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 374#endif 375 ); 376} 377 378void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 379 asm volatile ( 380 "mov $0x1080108,%%eax \n" 381 "movd %%eax,%%xmm5 \n" 382 "pshufd $0x0,%%xmm5,%%xmm5 \n" 383 "mov $0x42004200,%%eax \n" 384 "movd %%eax,%%xmm6 \n" 385 "pshufd $0x0,%%xmm6,%%xmm6 \n" 386 "pcmpeqb %%xmm3,%%xmm3 \n" 387 "psllw $0xb,%%xmm3 \n" 388 "movdqa %%xmm3,%%xmm4 \n" 389 "psrlw $0x6,%%xmm4 \n" 390 "pcmpeqb %%xmm7,%%xmm7 \n" 391 "psllw $0x8,%%xmm7 \n" 392 "sub %0,%1 \n" 393 "sub %0,%1 \n" 394 ".p2align 4 \n" 395 "1: \n" 396 "movdqu (%0),%%xmm0 \n" 397 "movdqa %%xmm0,%%xmm1 \n" 398 "movdqa %%xmm0,%%xmm2 \n" 399 "psllw $0x1,%%xmm1 \n" 400 "psllw $0xb,%%xmm2 \n" 401 "pand %%xmm3,%%xmm1 \n" 402 "pmulhuw %%xmm5,%%xmm2 \n" 403 "pmulhuw %%xmm5,%%xmm1 \n" 404 "psllw $0x8,%%xmm1 \n" 405 "por %%xmm2,%%xmm1 \n" 406 "movdqa %%xmm0,%%xmm2 \n" 407 "pand %%xmm4,%%xmm0 \n" 408 "psraw $0x8,%%xmm2 \n" 409 "pmulhuw %%xmm6,%%xmm0 \n" 410 "pand %%xmm7,%%xmm2 \n" 411 "por %%xmm2,%%xmm0 \n" 412 "movdqa %%xmm1,%%xmm2 \n" 413 "punpcklbw %%xmm0,%%xmm1 \n" 414 "punpckhbw %%xmm0,%%xmm2 \n" 415 "movdqa %%xmm1,(%1,%0,2) \n" 416 "movdqa %%xmm2,0x10(%1,%0,2) \n" 417 "lea 0x10(%0),%0 \n" 418 "sub $0x8,%2 \n" 419 "jg 1b \n" 420 : "+r"(src), // %0 421 "+r"(dst), // %1 422 "+r"(pix) // %2 423 : 424 : "memory", "cc", "eax" 425#if defined(__SSE2__) 426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 427#endif 428 ); 429} 430 431void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 432 asm volatile ( 433 "mov $0xf0f0f0f,%%eax \n" 434 "movd %%eax,%%xmm4 \n" 435 "pshufd $0x0,%%xmm4,%%xmm4 \n" 436 "movdqa %%xmm4,%%xmm5 \n" 437 "pslld $0x4,%%xmm5 \n" 438 "sub %0,%1 \n" 439 "sub %0,%1 \n" 440 ".p2align 4 \n" 441 "1: \n" 442 "movdqu (%0),%%xmm0 \n" 443 "movdqa %%xmm0,%%xmm2 \n" 444 "pand %%xmm4,%%xmm0 \n" 445 "pand %%xmm5,%%xmm2 \n" 446 "movdqa %%xmm0,%%xmm1 \n" 447 "movdqa %%xmm2,%%xmm3 \n" 448 "psllw $0x4,%%xmm1 \n" 449 "psrlw $0x4,%%xmm3 \n" 450 "por %%xmm1,%%xmm0 \n" 451 "por %%xmm3,%%xmm2 \n" 452 "movdqa %%xmm0,%%xmm1 \n" 453 "punpcklbw %%xmm2,%%xmm0 \n" 454 "punpckhbw %%xmm2,%%xmm1 \n" 455 "movdqa %%xmm0,(%1,%0,2) \n" 456 "movdqa %%xmm1,0x10(%1,%0,2) \n" 457 "lea 0x10(%0),%0 \n" 458 "sub $0x8,%2 \n" 459 "jg 1b \n" 460 : "+r"(src), // %0 461 "+r"(dst), // %1 462 "+r"(pix) // %2 463 : 464 : "memory", "cc", "eax" 465#if defined(__SSE2__) 466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 467#endif 468 ); 469} 470 471void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { 472 asm volatile ( 473 "movdqa %3,%%xmm6 \n" 474 ".p2align 4 \n" 475 "1: \n" 476 "movdqa (%0),%%xmm0 \n" 477 "movdqa 0x10(%0),%%xmm1 \n" 478 "movdqa 0x20(%0),%%xmm2 \n" 479 "movdqa 0x30(%0),%%xmm3 \n" 480 "lea 0x40(%0),%0 \n" 481 "pshufb %%xmm6,%%xmm0 \n" 482 "pshufb %%xmm6,%%xmm1 \n" 483 "pshufb %%xmm6,%%xmm2 \n" 484 "pshufb %%xmm6,%%xmm3 \n" 485 "movdqa %%xmm1,%%xmm4 \n" 486 "psrldq $0x4,%%xmm1 \n" 487 "pslldq $0xc,%%xmm4 \n" 488 "movdqa %%xmm2,%%xmm5 \n" 489 "por %%xmm4,%%xmm0 \n" 490 "pslldq $0x8,%%xmm5 \n" 491 "movdqa %%xmm0,(%1) \n" 492 "por %%xmm5,%%xmm1 \n" 493 "psrldq $0x8,%%xmm2 \n" 494 "pslldq $0x4,%%xmm3 \n" 495 "por %%xmm3,%%xmm2 \n" 496 "movdqa %%xmm1,0x10(%1) \n" 497 "movdqa %%xmm2,0x20(%1) \n" 498 "lea 0x30(%1),%1 \n" 499 "sub $0x10,%2 \n" 500 "jg 1b \n" 501 : "+r"(src), // %0 502 "+r"(dst), // %1 503 "+r"(pix) // %2 504 : "m"(kShuffleMaskARGBToRGB24) // %3 505 : "memory", "cc" 506#if defined(__SSE2__) 507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 508#endif 509 ); 510} 511 512void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { 513 asm volatile ( 514 "movdqa %3,%%xmm6 \n" 515 ".p2align 4 \n" 516 "1: \n" 517 "movdqa (%0),%%xmm0 \n" 518 "movdqa 0x10(%0),%%xmm1 \n" 519 "movdqa 0x20(%0),%%xmm2 \n" 520 "movdqa 0x30(%0),%%xmm3 \n" 521 "lea 0x40(%0),%0 \n" 522 "pshufb %%xmm6,%%xmm0 \n" 523 "pshufb %%xmm6,%%xmm1 \n" 524 "pshufb %%xmm6,%%xmm2 \n" 525 "pshufb %%xmm6,%%xmm3 \n" 526 "movdqa %%xmm1,%%xmm4 \n" 527 "psrldq $0x4,%%xmm1 \n" 528 "pslldq $0xc,%%xmm4 \n" 529 "movdqa %%xmm2,%%xmm5 \n" 530 "por %%xmm4,%%xmm0 \n" 531 "pslldq $0x8,%%xmm5 \n" 532 "movdqa %%xmm0,(%1) \n" 533 "por %%xmm5,%%xmm1 \n" 534 "psrldq $0x8,%%xmm2 \n" 535 "pslldq $0x4,%%xmm3 \n" 536 "por %%xmm3,%%xmm2 \n" 537 "movdqa %%xmm1,0x10(%1) \n" 538 "movdqa %%xmm2,0x20(%1) \n" 539 "lea 0x30(%1),%1 \n" 540 "sub $0x10,%2 \n" 541 "jg 1b \n" 542 : "+r"(src), // %0 543 "+r"(dst), // %1 544 "+r"(pix) // %2 545 : "m"(kShuffleMaskARGBToRAW) // %3 546 : "memory", "cc" 547#if defined(__SSE2__) 548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 549#endif 550 ); 551} 552 553void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { 554 asm volatile ( 555 "pcmpeqb %%xmm3,%%xmm3 \n" 556 "psrld $0x1b,%%xmm3 \n" 557 "pcmpeqb %%xmm4,%%xmm4 \n" 558 "psrld $0x1a,%%xmm4 \n" 559 "pslld $0x5,%%xmm4 \n" 560 "pcmpeqb %%xmm5,%%xmm5 \n" 561 "pslld $0xb,%%xmm5 \n" 562 ".p2align 4 \n" 563 "1: \n" 564 "movdqa (%0),%%xmm0 \n" 565 "movdqa %%xmm0,%%xmm1 \n" 566 "movdqa %%xmm0,%%xmm2 \n" 567 "pslld $0x8,%%xmm0 \n" 568 "psrld $0x3,%%xmm1 \n" 569 "psrld $0x5,%%xmm2 \n" 570 "psrad $0x10,%%xmm0 \n" 571 "pand %%xmm3,%%xmm1 \n" 572 "pand %%xmm4,%%xmm2 \n" 573 "pand %%xmm5,%%xmm0 \n" 574 "por %%xmm2,%%xmm1 \n" 575 "por %%xmm1,%%xmm0 \n" 576 "packssdw %%xmm0,%%xmm0 \n" 577 "lea 0x10(%0),%0 \n" 578 "movq %%xmm0,(%1) \n" 579 "lea 0x8(%1),%1 \n" 580 "sub $0x4,%2 \n" 581 "jg 1b \n" 582 : "+r"(src), // %0 583 "+r"(dst), // %1 584 "+r"(pix) // %2 585 : 586 : "memory", "cc" 587#if defined(__SSE2__) 588 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 589#endif 590 ); 591} 592 593void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { 594 asm volatile ( 595 "pcmpeqb %%xmm4,%%xmm4 \n" 596 "psrld $0x1b,%%xmm4 \n" 597 "movdqa %%xmm4,%%xmm5 \n" 598 "pslld $0x5,%%xmm5 \n" 599 "movdqa %%xmm4,%%xmm6 \n" 600 "pslld $0xa,%%xmm6 \n" 601 "pcmpeqb %%xmm7,%%xmm7 \n" 602 "pslld $0xf,%%xmm7 \n" 603 ".p2align 4 \n" 604 "1: \n" 605 "movdqa (%0),%%xmm0 \n" 606 "movdqa %%xmm0,%%xmm1 \n" 607 "movdqa %%xmm0,%%xmm2 \n" 608 "movdqa %%xmm0,%%xmm3 \n" 609 "psrad $0x10,%%xmm0 \n" 610 "psrld $0x3,%%xmm1 \n" 611 "psrld $0x6,%%xmm2 \n" 612 "psrld $0x9,%%xmm3 \n" 613 "pand %%xmm7,%%xmm0 \n" 614 "pand %%xmm4,%%xmm1 \n" 615 "pand %%xmm5,%%xmm2 \n" 616 "pand %%xmm6,%%xmm3 \n" 617 "por %%xmm1,%%xmm0 \n" 618 "por %%xmm3,%%xmm2 \n" 619 "por %%xmm2,%%xmm0 \n" 620 "packssdw %%xmm0,%%xmm0 \n" 621 "lea 0x10(%0),%0 \n" 622 "movq %%xmm0,(%1) \n" 623 "lea 0x8(%1),%1 \n" 624 "sub $0x4,%2 \n" 625 "jg 1b \n" 626 : "+r"(src), // %0 627 "+r"(dst), // %1 628 "+r"(pix) // %2 629 : 630 : "memory", "cc" 631#if defined(__SSE2__) 632 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 633#endif 634 ); 635} 636 637void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { 638 asm volatile ( 639 "pcmpeqb %%xmm4,%%xmm4 \n" 640 "psllw $0xc,%%xmm4 \n" 641 "movdqa %%xmm4,%%xmm3 \n" 642 "psrlw $0x8,%%xmm3 \n" 643 ".p2align 4 \n" 644 "1: \n" 645 "movdqa (%0),%%xmm0 \n" 646 "movdqa %%xmm0,%%xmm1 \n" 647 "pand %%xmm3,%%xmm0 \n" 648 "pand %%xmm4,%%xmm1 \n" 649 "psrlq $0x4,%%xmm0 \n" 650 "psrlq $0x8,%%xmm1 \n" 651 "por %%xmm1,%%xmm0 \n" 652 "packuswb %%xmm0,%%xmm0 \n" 653 "lea 0x10(%0),%0 \n" 654 "movq %%xmm0,(%1) \n" 655 "lea 0x8(%1),%1 \n" 656 "sub $0x4,%2 \n" 657 "jg 1b \n" 658 : "+r"(src), // %0 659 "+r"(dst), // %1 660 "+r"(pix) // %2 661 : 662 : "memory", "cc" 663#if defined(__SSE2__) 664 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 665#endif 666 ); 667} 668 669void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 670 asm volatile ( 671 "movdqa %4,%%xmm5 \n" 672 "movdqa %3,%%xmm4 \n" 673 ".p2align 4 \n" 674 "1: \n" 675 "movdqa (%0),%%xmm0 \n" 676 "movdqa 0x10(%0),%%xmm1 \n" 677 "movdqa 0x20(%0),%%xmm2 \n" 678 "movdqa 0x30(%0),%%xmm3 \n" 679 "pmaddubsw %%xmm4,%%xmm0 \n" 680 "pmaddubsw %%xmm4,%%xmm1 \n" 681 "pmaddubsw %%xmm4,%%xmm2 \n" 682 "pmaddubsw %%xmm4,%%xmm3 \n" 683 "lea 0x40(%0),%0 \n" 684 "phaddw %%xmm1,%%xmm0 \n" 685 "phaddw %%xmm3,%%xmm2 \n" 686 "psrlw $0x7,%%xmm0 \n" 687 "psrlw $0x7,%%xmm2 \n" 688 "packuswb %%xmm2,%%xmm0 \n" 689 "paddb %%xmm5,%%xmm0 \n" 690 "sub $0x10,%2 \n" 691 "movdqa %%xmm0,(%1) \n" 692 "lea 0x10(%1),%1 \n" 693 "jg 1b \n" 694 : "+r"(src_argb), // %0 695 "+r"(dst_y), // %1 696 "+r"(pix) // %2 697 : "m"(kARGBToY), // %3 698 "m"(kAddY16) // %4 699 : "memory", "cc" 700#if defined(__SSE2__) 701 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 702#endif 703 ); 704} 705 706void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 707 asm volatile ( 708 "movdqa %4,%%xmm5 \n" 709 "movdqa %3,%%xmm4 \n" 710 ".p2align 4 \n" 711 "1: \n" 712 "movdqu (%0),%%xmm0 \n" 713 "movdqu 0x10(%0),%%xmm1 \n" 714 "movdqu 0x20(%0),%%xmm2 \n" 715 "movdqu 0x30(%0),%%xmm3 \n" 716 "pmaddubsw %%xmm4,%%xmm0 \n" 717 "pmaddubsw %%xmm4,%%xmm1 \n" 718 "pmaddubsw %%xmm4,%%xmm2 \n" 719 "pmaddubsw %%xmm4,%%xmm3 \n" 720 "lea 0x40(%0),%0 \n" 721 "phaddw %%xmm1,%%xmm0 \n" 722 "phaddw %%xmm3,%%xmm2 \n" 723 "psrlw $0x7,%%xmm0 \n" 724 "psrlw $0x7,%%xmm2 \n" 725 "packuswb %%xmm2,%%xmm0 \n" 726 "paddb %%xmm5,%%xmm0 \n" 727 "sub $0x10,%2 \n" 728 "movdqu %%xmm0,(%1) \n" 729 "lea 0x10(%1),%1 \n" 730 "jg 1b \n" 731 : "+r"(src_argb), // %0 732 "+r"(dst_y), // %1 733 "+r"(pix) // %2 734 : "m"(kARGBToY), // %3 735 "m"(kAddY16) // %4 736 : "memory", "cc" 737#if defined(__SSE2__) 738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 739#endif 740 ); 741} 742 743// TODO(fbarchard): pass xmm constants to single block of assembly. 744// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes 745// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, 746// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around 747// and considered unsafe. 748void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 749 uint8* dst_u, uint8* dst_v, int width) { 750 asm volatile ( 751 "movdqa %0,%%xmm4 \n" 752 "movdqa %1,%%xmm3 \n" 753 "movdqa %2,%%xmm5 \n" 754 : 755 : "m"(kARGBToU), // %0 756 "m"(kARGBToV), // %1 757 "m"(kAddUV128) // %2 758 ); 759 asm volatile ( 760 "sub %1,%2 \n" 761 ".p2align 4 \n" 762 "1: \n" 763 "movdqa (%0),%%xmm0 \n" 764 "movdqa 0x10(%0),%%xmm1 \n" 765 "movdqa 0x20(%0),%%xmm2 \n" 766 "movdqa 0x30(%0),%%xmm6 \n" 767 "pavgb (%0,%4,1),%%xmm0 \n" 768 "pavgb 0x10(%0,%4,1),%%xmm1 \n" 769 "pavgb 0x20(%0,%4,1),%%xmm2 \n" 770 "pavgb 0x30(%0,%4,1),%%xmm6 \n" 771 "lea 0x40(%0),%0 \n" 772 "movdqa %%xmm0,%%xmm7 \n" 773 "shufps $0x88,%%xmm1,%%xmm0 \n" 774 "shufps $0xdd,%%xmm1,%%xmm7 \n" 775 "pavgb %%xmm7,%%xmm0 \n" 776 "movdqa %%xmm2,%%xmm7 \n" 777 "shufps $0x88,%%xmm6,%%xmm2 \n" 778 "shufps $0xdd,%%xmm6,%%xmm7 \n" 779 "pavgb %%xmm7,%%xmm2 \n" 780 "movdqa %%xmm0,%%xmm1 \n" 781 "movdqa %%xmm2,%%xmm6 \n" 782 "pmaddubsw %%xmm4,%%xmm0 \n" 783 "pmaddubsw %%xmm4,%%xmm2 \n" 784 "pmaddubsw %%xmm3,%%xmm1 \n" 785 "pmaddubsw %%xmm3,%%xmm6 \n" 786 "phaddw %%xmm2,%%xmm0 \n" 787 "phaddw %%xmm6,%%xmm1 \n" 788 "psraw $0x8,%%xmm0 \n" 789 "psraw $0x8,%%xmm1 \n" 790 "packsswb %%xmm1,%%xmm0 \n" 791 "paddb %%xmm5,%%xmm0 \n" 792 "sub $0x10,%3 \n" 793 "movlps %%xmm0,(%1) \n" 794 "movhps %%xmm0,(%1,%2,1) \n" 795 "lea 0x8(%1),%1 \n" 796 "jg 1b \n" 797 : "+r"(src_argb0), // %0 798 "+r"(dst_u), // %1 799 "+r"(dst_v), // %2 800 "+rm"(width) // %3 801 : "r"(static_cast<intptr_t>(src_stride_argb)) 802 : "memory", "cc" 803#if defined(__SSE2__) 804 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 805#endif 806 ); 807} 808 809void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 810 uint8* dst_u, uint8* dst_v, int width) { 811 asm volatile ( 812 "movdqa %0,%%xmm4 \n" 813 "movdqa %1,%%xmm3 \n" 814 "movdqa %2,%%xmm5 \n" 815 : 816 : "m"(kARGBToU), // %0 817 "m"(kARGBToV), // %1 818 "m"(kAddUV128) // %2 819 ); 820 asm volatile ( 821 "sub %1,%2 \n" 822 ".p2align 4 \n" 823 "1: \n" 824 "movdqu (%0),%%xmm0 \n" 825 "movdqu 0x10(%0),%%xmm1 \n" 826 "movdqu 0x20(%0),%%xmm2 \n" 827 "movdqu 0x30(%0),%%xmm6 \n" 828 "movdqu (%0,%4,1),%%xmm7 \n" 829 "pavgb %%xmm7,%%xmm0 \n" 830 "movdqu 0x10(%0,%4,1),%%xmm7 \n" 831 "pavgb %%xmm7,%%xmm1 \n" 832 "movdqu 0x20(%0,%4,1),%%xmm7 \n" 833 "pavgb %%xmm7,%%xmm2 \n" 834 "movdqu 0x30(%0,%4,1),%%xmm7 \n" 835 "pavgb %%xmm7,%%xmm6 \n" 836 "lea 0x40(%0),%0 \n" 837 "movdqa %%xmm0,%%xmm7 \n" 838 "shufps $0x88,%%xmm1,%%xmm0 \n" 839 "shufps $0xdd,%%xmm1,%%xmm7 \n" 840 "pavgb %%xmm7,%%xmm0 \n" 841 "movdqa %%xmm2,%%xmm7 \n" 842 "shufps $0x88,%%xmm6,%%xmm2 \n" 843 "shufps $0xdd,%%xmm6,%%xmm7 \n" 844 "pavgb %%xmm7,%%xmm2 \n" 845 "movdqa %%xmm0,%%xmm1 \n" 846 "movdqa %%xmm2,%%xmm6 \n" 847 "pmaddubsw %%xmm4,%%xmm0 \n" 848 "pmaddubsw %%xmm4,%%xmm2 \n" 849 "pmaddubsw %%xmm3,%%xmm1 \n" 850 "pmaddubsw %%xmm3,%%xmm6 \n" 851 "phaddw %%xmm2,%%xmm0 \n" 852 "phaddw %%xmm6,%%xmm1 \n" 853 "psraw $0x8,%%xmm0 \n" 854 "psraw $0x8,%%xmm1 \n" 855 "packsswb %%xmm1,%%xmm0 \n" 856 "paddb %%xmm5,%%xmm0 \n" 857 "sub $0x10,%3 \n" 858 "movlps %%xmm0,(%1) \n" 859 "movhps %%xmm0,(%1,%2,1) \n" 860 "lea 0x8(%1),%1 \n" 861 "jg 1b \n" 862 : "+r"(src_argb0), // %0 863 "+r"(dst_u), // %1 864 "+r"(dst_v), // %2 865 "+rm"(width) // %3 866 : "r"(static_cast<intptr_t>(src_stride_argb)) 867 : "memory", "cc" 868#if defined(__SSE2__) 869 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 870#endif 871 ); 872} 873 874void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { 875 asm volatile ( 876 "movdqa %4,%%xmm5 \n" 877 "movdqa %3,%%xmm4 \n" 878 ".p2align 4 \n" 879 "1: \n" 880 "movdqa (%0),%%xmm0 \n" 881 "movdqa 0x10(%0),%%xmm1 \n" 882 "movdqa 0x20(%0),%%xmm2 \n" 883 "movdqa 0x30(%0),%%xmm3 \n" 884 "pmaddubsw %%xmm4,%%xmm0 \n" 885 "pmaddubsw %%xmm4,%%xmm1 \n" 886 "pmaddubsw %%xmm4,%%xmm2 \n" 887 "pmaddubsw %%xmm4,%%xmm3 \n" 888 "lea 0x40(%0),%0 \n" 889 "phaddw %%xmm1,%%xmm0 \n" 890 "phaddw %%xmm3,%%xmm2 \n" 891 "psrlw $0x7,%%xmm0 \n" 892 "psrlw $0x7,%%xmm2 \n" 893 "packuswb %%xmm2,%%xmm0 \n" 894 "paddb %%xmm5,%%xmm0 \n" 895 "sub $0x10,%2 \n" 896 "movdqa %%xmm0,(%1) \n" 897 "lea 0x10(%1),%1 \n" 898 "jg 1b \n" 899 : "+r"(src_bgra), // %0 900 "+r"(dst_y), // %1 901 "+r"(pix) // %2 902 : "m"(kBGRAToY), // %3 903 "m"(kAddY16) // %4 904 : "memory", "cc" 905#if defined(__SSE2__) 906 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 907#endif 908 ); 909} 910 911void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { 912 asm volatile ( 913 "movdqa %4,%%xmm5 \n" 914 "movdqa %3,%%xmm4 \n" 915 ".p2align 4 \n" 916 "1: \n" 917 "movdqu (%0),%%xmm0 \n" 918 "movdqu 0x10(%0),%%xmm1 \n" 919 "movdqu 0x20(%0),%%xmm2 \n" 920 "movdqu 0x30(%0),%%xmm3 \n" 921 "pmaddubsw %%xmm4,%%xmm0 \n" 922 "pmaddubsw %%xmm4,%%xmm1 \n" 923 "pmaddubsw %%xmm4,%%xmm2 \n" 924 "pmaddubsw %%xmm4,%%xmm3 \n" 925 "lea 0x40(%0),%0 \n" 926 "phaddw %%xmm1,%%xmm0 \n" 927 "phaddw %%xmm3,%%xmm2 \n" 928 "psrlw $0x7,%%xmm0 \n" 929 "psrlw $0x7,%%xmm2 \n" 930 "packuswb %%xmm2,%%xmm0 \n" 931 "paddb %%xmm5,%%xmm0 \n" 932 "sub $0x10,%2 \n" 933 "movdqu %%xmm0,(%1) \n" 934 "lea 0x10(%1),%1 \n" 935 "jg 1b \n" 936 : "+r"(src_bgra), // %0 937 "+r"(dst_y), // %1 938 "+r"(pix) // %2 939 : "m"(kBGRAToY), // %3 940 "m"(kAddY16) // %4 941 : "memory", "cc" 942#if defined(__SSE2__) 943 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 944#endif 945 ); 946} 947 948void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, 949 uint8* dst_u, uint8* dst_v, int width) { 950 asm volatile ( 951 "movdqa %0,%%xmm4 \n" 952 "movdqa %1,%%xmm3 \n" 953 "movdqa %2,%%xmm5 \n" 954 : 955 : "m"(kBGRAToU), // %0 956 "m"(kBGRAToV), // %1 957 "m"(kAddUV128) // %2 958 ); 959 asm volatile ( 960 "sub %1,%2 \n" 961 ".p2align 4 \n" 962 "1: \n" 963 "movdqa (%0),%%xmm0 \n" 964 "movdqa 0x10(%0),%%xmm1 \n" 965 "movdqa 0x20(%0),%%xmm2 \n" 966 "movdqa 0x30(%0),%%xmm6 \n" 967 "pavgb (%0,%4,1),%%xmm0 \n" 968 "pavgb 0x10(%0,%4,1),%%xmm1 \n" 969 "pavgb 0x20(%0,%4,1),%%xmm2 \n" 970 "pavgb 0x30(%0,%4,1),%%xmm6 \n" 971 "lea 0x40(%0),%0 \n" 972 "movdqa %%xmm0,%%xmm7 \n" 973 "shufps $0x88,%%xmm1,%%xmm0 \n" 974 "shufps $0xdd,%%xmm1,%%xmm7 \n" 975 "pavgb %%xmm7,%%xmm0 \n" 976 "movdqa %%xmm2,%%xmm7 \n" 977 "shufps $0x88,%%xmm6,%%xmm2 \n" 978 "shufps $0xdd,%%xmm6,%%xmm7 \n" 979 "pavgb %%xmm7,%%xmm2 \n" 980 "movdqa %%xmm0,%%xmm1 \n" 981 "movdqa %%xmm2,%%xmm6 \n" 982 "pmaddubsw %%xmm4,%%xmm0 \n" 983 "pmaddubsw %%xmm4,%%xmm2 \n" 984 "pmaddubsw %%xmm3,%%xmm1 \n" 985 "pmaddubsw %%xmm3,%%xmm6 \n" 986 "phaddw %%xmm2,%%xmm0 \n" 987 "phaddw %%xmm6,%%xmm1 \n" 988 "psraw $0x8,%%xmm0 \n" 989 "psraw $0x8,%%xmm1 \n" 990 "packsswb %%xmm1,%%xmm0 \n" 991 "paddb %%xmm5,%%xmm0 \n" 992 "sub $0x10,%3 \n" 993 "movlps %%xmm0,(%1) \n" 994 "movhps %%xmm0,(%1,%2,1) \n" 995 "lea 0x8(%1),%1 \n" 996 "jg 1b \n" 997 : "+r"(src_bgra0), // %0 998 "+r"(dst_u), // %1 999 "+r"(dst_v), // %2 1000 "+rm"(width) // %3 1001 : "r"(static_cast<intptr_t>(src_stride_bgra)) 1002 : "memory", "cc" 1003#if defined(__SSE2__) 1004 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1005#endif 1006 ); 1007} 1008 1009void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, 1010 uint8* dst_u, uint8* dst_v, int width) { 1011 asm volatile ( 1012 "movdqa %0,%%xmm4 \n" 1013 "movdqa %1,%%xmm3 \n" 1014 "movdqa %2,%%xmm5 \n" 1015 : 1016 : "m"(kBGRAToU), // %0 1017 "m"(kBGRAToV), // %1 1018 "m"(kAddUV128) // %2 1019 ); 1020 asm volatile ( 1021 "sub %1,%2 \n" 1022 ".p2align 4 \n" 1023 "1: \n" 1024 "movdqu (%0),%%xmm0 \n" 1025 "movdqu 0x10(%0),%%xmm1 \n" 1026 "movdqu 0x20(%0),%%xmm2 \n" 1027 "movdqu 0x30(%0),%%xmm6 \n" 1028 "movdqu (%0,%4,1),%%xmm7 \n" 1029 "pavgb %%xmm7,%%xmm0 \n" 1030 "movdqu 0x10(%0,%4,1),%%xmm7 \n" 1031 "pavgb %%xmm7,%%xmm1 \n" 1032 "movdqu 0x20(%0,%4,1),%%xmm7 \n" 1033 "pavgb %%xmm7,%%xmm2 \n" 1034 "movdqu 0x30(%0,%4,1),%%xmm7 \n" 1035 "pavgb %%xmm7,%%xmm6 \n" 1036 "lea 0x40(%0),%0 \n" 1037 "movdqa %%xmm0,%%xmm7 \n" 1038 "shufps $0x88,%%xmm1,%%xmm0 \n" 1039 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1040 "pavgb %%xmm7,%%xmm0 \n" 1041 "movdqa %%xmm2,%%xmm7 \n" 1042 "shufps $0x88,%%xmm6,%%xmm2 \n" 1043 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1044 "pavgb %%xmm7,%%xmm2 \n" 1045 "movdqa %%xmm0,%%xmm1 \n" 1046 "movdqa %%xmm2,%%xmm6 \n" 1047 "pmaddubsw %%xmm4,%%xmm0 \n" 1048 "pmaddubsw %%xmm4,%%xmm2 \n" 1049 "pmaddubsw %%xmm3,%%xmm1 \n" 1050 "pmaddubsw %%xmm3,%%xmm6 \n" 1051 "phaddw %%xmm2,%%xmm0 \n" 1052 "phaddw %%xmm6,%%xmm1 \n" 1053 "psraw $0x8,%%xmm0 \n" 1054 "psraw $0x8,%%xmm1 \n" 1055 "packsswb %%xmm1,%%xmm0 \n" 1056 "paddb %%xmm5,%%xmm0 \n" 1057 "sub $0x10,%3 \n" 1058 "movlps %%xmm0,(%1) \n" 1059 "movhps %%xmm0,(%1,%2,1) \n" 1060 "lea 0x8(%1),%1 \n" 1061 "jg 1b \n" 1062 : "+r"(src_bgra0), // %0 1063 "+r"(dst_u), // %1 1064 "+r"(dst_v), // %2 1065 "+rm"(width) // %3 1066 : "r"(static_cast<intptr_t>(src_stride_bgra)) 1067 : "memory", "cc" 1068#if defined(__SSE2__) 1069 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1070#endif 1071 ); 1072} 1073 1074void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { 1075 asm volatile ( 1076 "movdqa %4,%%xmm5 \n" 1077 "movdqa %3,%%xmm4 \n" 1078 ".p2align 4 \n" 1079 "1: \n" 1080 "movdqa (%0),%%xmm0 \n" 1081 "movdqa 0x10(%0),%%xmm1 \n" 1082 "movdqa 0x20(%0),%%xmm2 \n" 1083 "movdqa 0x30(%0),%%xmm3 \n" 1084 "pmaddubsw %%xmm4,%%xmm0 \n" 1085 "pmaddubsw %%xmm4,%%xmm1 \n" 1086 "pmaddubsw %%xmm4,%%xmm2 \n" 1087 "pmaddubsw %%xmm4,%%xmm3 \n" 1088 "lea 0x40(%0),%0 \n" 1089 "phaddw %%xmm1,%%xmm0 \n" 1090 "phaddw %%xmm3,%%xmm2 \n" 1091 "psrlw $0x7,%%xmm0 \n" 1092 "psrlw $0x7,%%xmm2 \n" 1093 "packuswb %%xmm2,%%xmm0 \n" 1094 "paddb %%xmm5,%%xmm0 \n" 1095 "sub $0x10,%2 \n" 1096 "movdqa %%xmm0,(%1) \n" 1097 "lea 0x10(%1),%1 \n" 1098 "jg 1b \n" 1099 : "+r"(src_abgr), // %0 1100 "+r"(dst_y), // %1 1101 "+r"(pix) // %2 1102 : "m"(kABGRToY), // %3 1103 "m"(kAddY16) // %4 1104 : "memory", "cc" 1105#if defined(__SSE2__) 1106 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1107#endif 1108 ); 1109} 1110 1111void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { 1112 asm volatile ( 1113 "movdqa %4,%%xmm5 \n" 1114 "movdqa %3,%%xmm4 \n" 1115 ".p2align 4 \n" 1116 "1: \n" 1117 "movdqu (%0),%%xmm0 \n" 1118 "movdqu 0x10(%0),%%xmm1 \n" 1119 "movdqu 0x20(%0),%%xmm2 \n" 1120 "movdqu 0x30(%0),%%xmm3 \n" 1121 "pmaddubsw %%xmm4,%%xmm0 \n" 1122 "pmaddubsw %%xmm4,%%xmm1 \n" 1123 "pmaddubsw %%xmm4,%%xmm2 \n" 1124 "pmaddubsw %%xmm4,%%xmm3 \n" 1125 "lea 0x40(%0),%0 \n" 1126 "phaddw %%xmm1,%%xmm0 \n" 1127 "phaddw %%xmm3,%%xmm2 \n" 1128 "psrlw $0x7,%%xmm0 \n" 1129 "psrlw $0x7,%%xmm2 \n" 1130 "packuswb %%xmm2,%%xmm0 \n" 1131 "paddb %%xmm5,%%xmm0 \n" 1132 "sub $0x10,%2 \n" 1133 "movdqu %%xmm0,(%1) \n" 1134 "lea 0x10(%1),%1 \n" 1135 "jg 1b \n" 1136 : "+r"(src_abgr), // %0 1137 "+r"(dst_y), // %1 1138 "+r"(pix) // %2 1139 : "m"(kABGRToY), // %3 1140 "m"(kAddY16) // %4 1141 : "memory", "cc" 1142#if defined(__SSE2__) 1143 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1144#endif 1145 ); 1146} 1147 1148void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, 1149 uint8* dst_u, uint8* dst_v, int width) { 1150 asm volatile ( 1151 "movdqa %0,%%xmm4 \n" 1152 "movdqa %1,%%xmm3 \n" 1153 "movdqa %2,%%xmm5 \n" 1154 : 1155 : "m"(kABGRToU), // %0 1156 "m"(kABGRToV), // %1 1157 "m"(kAddUV128) // %2 1158 ); 1159 asm volatile ( 1160 "sub %1,%2 \n" 1161 ".p2align 4 \n" 1162 "1: \n" 1163 "movdqa (%0),%%xmm0 \n" 1164 "movdqa 0x10(%0),%%xmm1 \n" 1165 "movdqa 0x20(%0),%%xmm2 \n" 1166 "movdqa 0x30(%0),%%xmm6 \n" 1167 "pavgb (%0,%4,1),%%xmm0 \n" 1168 "pavgb 0x10(%0,%4,1),%%xmm1 \n" 1169 "pavgb 0x20(%0,%4,1),%%xmm2 \n" 1170 "pavgb 0x30(%0,%4,1),%%xmm6 \n" 1171 "lea 0x40(%0),%0 \n" 1172 "movdqa %%xmm0,%%xmm7 \n" 1173 "shufps $0x88,%%xmm1,%%xmm0 \n" 1174 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1175 "pavgb %%xmm7,%%xmm0 \n" 1176 "movdqa %%xmm2,%%xmm7 \n" 1177 "shufps $0x88,%%xmm6,%%xmm2 \n" 1178 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1179 "pavgb %%xmm7,%%xmm2 \n" 1180 "movdqa %%xmm0,%%xmm1 \n" 1181 "movdqa %%xmm2,%%xmm6 \n" 1182 "pmaddubsw %%xmm4,%%xmm0 \n" 1183 "pmaddubsw %%xmm4,%%xmm2 \n" 1184 "pmaddubsw %%xmm3,%%xmm1 \n" 1185 "pmaddubsw %%xmm3,%%xmm6 \n" 1186 "phaddw %%xmm2,%%xmm0 \n" 1187 "phaddw %%xmm6,%%xmm1 \n" 1188 "psraw $0x8,%%xmm0 \n" 1189 "psraw $0x8,%%xmm1 \n" 1190 "packsswb %%xmm1,%%xmm0 \n" 1191 "paddb %%xmm5,%%xmm0 \n" 1192 "sub $0x10,%3 \n" 1193 "movlps %%xmm0,(%1) \n" 1194 "movhps %%xmm0,(%1,%2,1) \n" 1195 "lea 0x8(%1),%1 \n" 1196 "jg 1b \n" 1197 : "+r"(src_abgr0), // %0 1198 "+r"(dst_u), // %1 1199 "+r"(dst_v), // %2 1200 "+rm"(width) // %3 1201 : "r"(static_cast<intptr_t>(src_stride_abgr)) 1202 : "memory", "cc" 1203#if defined(__SSE2__) 1204 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1205#endif 1206 ); 1207} 1208 1209void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, 1210 uint8* dst_u, uint8* dst_v, int width) { 1211 asm volatile ( 1212 "movdqa %0,%%xmm4 \n" 1213 "movdqa %1,%%xmm3 \n" 1214 "movdqa %2,%%xmm5 \n" 1215 : 1216 : "m"(kABGRToU), // %0 1217 "m"(kABGRToV), // %1 1218 "m"(kAddUV128) // %2 1219 ); 1220 asm volatile ( 1221 "sub %1,%2 \n" 1222 ".p2align 4 \n" 1223 "1: \n" 1224 "movdqu (%0),%%xmm0 \n" 1225 "movdqu 0x10(%0),%%xmm1 \n" 1226 "movdqu 0x20(%0),%%xmm2 \n" 1227 "movdqu 0x30(%0),%%xmm6 \n" 1228 "movdqu (%0,%4,1),%%xmm7 \n" 1229 "pavgb %%xmm7,%%xmm0 \n" 1230 "movdqu 0x10(%0,%4,1),%%xmm7 \n" 1231 "pavgb %%xmm7,%%xmm1 \n" 1232 "movdqu 0x20(%0,%4,1),%%xmm7 \n" 1233 "pavgb %%xmm7,%%xmm2 \n" 1234 "movdqu 0x30(%0,%4,1),%%xmm7 \n" 1235 "pavgb %%xmm7,%%xmm6 \n" 1236 "lea 0x40(%0),%0 \n" 1237 "movdqa %%xmm0,%%xmm7 \n" 1238 "shufps $0x88,%%xmm1,%%xmm0 \n" 1239 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1240 "pavgb %%xmm7,%%xmm0 \n" 1241 "movdqa %%xmm2,%%xmm7 \n" 1242 "shufps $0x88,%%xmm6,%%xmm2 \n" 1243 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1244 "pavgb %%xmm7,%%xmm2 \n" 1245 "movdqa %%xmm0,%%xmm1 \n" 1246 "movdqa %%xmm2,%%xmm6 \n" 1247 "pmaddubsw %%xmm4,%%xmm0 \n" 1248 "pmaddubsw %%xmm4,%%xmm2 \n" 1249 "pmaddubsw %%xmm3,%%xmm1 \n" 1250 "pmaddubsw %%xmm3,%%xmm6 \n" 1251 "phaddw %%xmm2,%%xmm0 \n" 1252 "phaddw %%xmm6,%%xmm1 \n" 1253 "psraw $0x8,%%xmm0 \n" 1254 "psraw $0x8,%%xmm1 \n" 1255 "packsswb %%xmm1,%%xmm0 \n" 1256 "paddb %%xmm5,%%xmm0 \n" 1257 "sub $0x10,%3 \n" 1258 "movlps %%xmm0,(%1) \n" 1259 "movhps %%xmm0,(%1,%2,1) \n" 1260 "lea 0x8(%1),%1 \n" 1261 "jg 1b \n" 1262 : "+r"(src_abgr0), // %0 1263 "+r"(dst_u), // %1 1264 "+r"(dst_v), // %2 1265 "+rm"(width) // %3 1266 : "r"(static_cast<intptr_t>(src_stride_abgr)) 1267 : "memory", "cc" 1268#if defined(__SSE2__) 1269 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1270#endif 1271 ); 1272} 1273#endif // HAS_ARGBTOYROW_SSSE3 1274 1275#ifdef HAS_I422TOARGBROW_SSSE3 1276#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ 1277#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ 1278#define UR 0 1279 1280#define VB 0 1281#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */ 1282#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */ 1283 1284// Bias 1285#define BB UB * 128 + VB * 128 1286#define BG UG * 128 + VG * 128 1287#define BR UR * 128 + VR * 128 1288 1289#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ 1290 1291struct { 1292 vec8 kUVToB; // 0 1293 vec8 kUVToG; // 16 1294 vec8 kUVToR; // 32 1295 vec16 kUVBiasB; // 48 1296 vec16 kUVBiasG; // 64 1297 vec16 kUVBiasR; // 80 1298 vec16 kYSub16; // 96 1299 vec16 kYToRgb; // 112 1300 vec8 kVUToB; // 128 1301 vec8 kVUToG; // 144 1302 vec8 kVUToR; // 160 1303} CONST SIMD_ALIGNED(kYuvConstants) = { 1304 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, 1305 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, 1306 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, 1307 { BB, BB, BB, BB, BB, BB, BB, BB }, 1308 { BG, BG, BG, BG, BG, BG, BG, BG }, 1309 { BR, BR, BR, BR, BR, BR, BR, BR }, 1310 { 16, 16, 16, 16, 16, 16, 16, 16 }, 1311 { YG, YG, YG, YG, YG, YG, YG, YG }, 1312 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, 1313 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, 1314 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } 1315}; 1316 1317 1318// Read 8 UV from 411 1319#define READYUV444 \ 1320 "movq (%[u_buf]),%%xmm0 \n" \ 1321 "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \ 1322 "lea 0x8(%[u_buf]),%[u_buf] \n" \ 1323 "punpcklbw %%xmm1,%%xmm0 \n" \ 1324 1325// Read 4 UV from 422, upsample to 8 UV 1326#define READYUV422 \ 1327 "movd (%[u_buf]),%%xmm0 \n" \ 1328 "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ 1329 "lea 0x4(%[u_buf]),%[u_buf] \n" \ 1330 "punpcklbw %%xmm1,%%xmm0 \n" \ 1331 "punpcklwd %%xmm0,%%xmm0 \n" \ 1332 1333// Read 2 UV from 411, upsample to 8 UV 1334#define READYUV411 \ 1335 "movd (%[u_buf]),%%xmm0 \n" \ 1336 "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ 1337 "lea 0x2(%[u_buf]),%[u_buf] \n" \ 1338 "punpcklbw %%xmm1,%%xmm0 \n" \ 1339 "punpcklwd %%xmm0,%%xmm0 \n" \ 1340 "punpckldq %%xmm0,%%xmm0 \n" \ 1341 1342// Read 4 UV from NV12, upsample to 8 UV 1343#define READNV12 \ 1344 "movq (%[uv_buf]),%%xmm0 \n" \ 1345 "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ 1346 "punpcklwd %%xmm0,%%xmm0 \n" \ 1347 1348// Convert 8 pixels: 8 UV and 8 Y 1349#define YUVTORGB \ 1350 "movdqa %%xmm0,%%xmm1 \n" \ 1351 "movdqa %%xmm0,%%xmm2 \n" \ 1352 "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \ 1353 "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \ 1354 "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \ 1355 "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ 1356 "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ 1357 "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ 1358 "movq (%[y_buf]),%%xmm3 \n" \ 1359 "lea 0x8(%[y_buf]),%[y_buf] \n" \ 1360 "punpcklbw %%xmm4,%%xmm3 \n" \ 1361 "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ 1362 "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ 1363 "paddsw %%xmm3,%%xmm0 \n" \ 1364 "paddsw %%xmm3,%%xmm1 \n" \ 1365 "paddsw %%xmm3,%%xmm2 \n" \ 1366 "psraw $0x6,%%xmm0 \n" \ 1367 "psraw $0x6,%%xmm1 \n" \ 1368 "psraw $0x6,%%xmm2 \n" \ 1369 "packuswb %%xmm0,%%xmm0 \n" \ 1370 "packuswb %%xmm1,%%xmm1 \n" \ 1371 "packuswb %%xmm2,%%xmm2 \n" \ 1372 1373// Convert 8 pixels: 8 VU and 8 Y 1374#define YVUTORGB \ 1375 "movdqa %%xmm0,%%xmm1 \n" \ 1376 "movdqa %%xmm0,%%xmm2 \n" \ 1377 "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \ 1378 "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \ 1379 "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \ 1380 "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ 1381 "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ 1382 "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ 1383 "movq (%[y_buf]),%%xmm3 \n" \ 1384 "lea 0x8(%[y_buf]),%[y_buf] \n" \ 1385 "punpcklbw %%xmm4,%%xmm3 \n" \ 1386 "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ 1387 "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ 1388 "paddsw %%xmm3,%%xmm0 \n" \ 1389 "paddsw %%xmm3,%%xmm1 \n" \ 1390 "paddsw %%xmm3,%%xmm2 \n" \ 1391 "psraw $0x6,%%xmm0 \n" \ 1392 "psraw $0x6,%%xmm1 \n" \ 1393 "psraw $0x6,%%xmm2 \n" \ 1394 "packuswb %%xmm0,%%xmm0 \n" \ 1395 "packuswb %%xmm1,%%xmm1 \n" \ 1396 "packuswb %%xmm2,%%xmm2 \n" \ 1397 1398void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, 1399 const uint8* u_buf, 1400 const uint8* v_buf, 1401 uint8* argb_buf, 1402 int width) { 1403 asm volatile ( 1404 "sub %[u_buf],%[v_buf] \n" 1405 "pcmpeqb %%xmm5,%%xmm5 \n" 1406 "pxor %%xmm4,%%xmm4 \n" 1407 ".p2align 4 \n" 1408 "1: \n" 1409 READYUV444 1410 YUVTORGB 1411 "punpcklbw %%xmm1,%%xmm0 \n" 1412 "punpcklbw %%xmm5,%%xmm2 \n" 1413 "movdqa %%xmm0,%%xmm1 \n" 1414 "punpcklwd %%xmm2,%%xmm0 \n" 1415 "punpckhwd %%xmm2,%%xmm1 \n" 1416 "movdqa %%xmm0,(%[argb_buf]) \n" 1417 "movdqa %%xmm1,0x10(%[argb_buf]) \n" 1418 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1419 "sub $0x8,%[width] \n" 1420 "jg 1b \n" 1421 : [y_buf]"+r"(y_buf), // %[y_buf] 1422 [u_buf]"+r"(u_buf), // %[u_buf] 1423 [v_buf]"+r"(v_buf), // %[v_buf] 1424 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1425 [width]"+rm"(width) // %[width] 1426 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1427 : "memory", "cc" 1428#if defined(__SSE2__) 1429 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1430#endif 1431 ); 1432} 1433 1434void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, 1435 const uint8* u_buf, 1436 const uint8* v_buf, 1437 uint8* argb_buf, 1438 int width) { 1439 asm volatile ( 1440 "sub %[u_buf],%[v_buf] \n" 1441 "pcmpeqb %%xmm5,%%xmm5 \n" 1442 "pxor %%xmm4,%%xmm4 \n" 1443 ".p2align 4 \n" 1444 "1: \n" 1445 READYUV422 1446 YUVTORGB 1447 "punpcklbw %%xmm1,%%xmm0 \n" 1448 "punpcklbw %%xmm5,%%xmm2 \n" 1449 "movdqa %%xmm0,%%xmm1 \n" 1450 "punpcklwd %%xmm2,%%xmm0 \n" 1451 "punpckhwd %%xmm2,%%xmm1 \n" 1452 "movdqa %%xmm0,(%[argb_buf]) \n" 1453 "movdqa %%xmm1,0x10(%[argb_buf]) \n" 1454 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1455 "sub $0x8,%[width] \n" 1456 "jg 1b \n" 1457 : [y_buf]"+r"(y_buf), // %[y_buf] 1458 [u_buf]"+r"(u_buf), // %[u_buf] 1459 [v_buf]"+r"(v_buf), // %[v_buf] 1460 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1461 [width]"+rm"(width) // %[width] 1462 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1463 : "memory", "cc" 1464#if defined(__SSE2__) 1465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1466#endif 1467 ); 1468} 1469 1470void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, 1471 const uint8* u_buf, 1472 const uint8* v_buf, 1473 uint8* argb_buf, 1474 int width) { 1475 asm volatile ( 1476 "sub %[u_buf],%[v_buf] \n" 1477 "pcmpeqb %%xmm5,%%xmm5 \n" 1478 "pxor %%xmm4,%%xmm4 \n" 1479 ".p2align 4 \n" 1480 "1: \n" 1481 READYUV411 1482 YUVTORGB 1483 "punpcklbw %%xmm1,%%xmm0 \n" 1484 "punpcklbw %%xmm5,%%xmm2 \n" 1485 "movdqa %%xmm0,%%xmm1 \n" 1486 "punpcklwd %%xmm2,%%xmm0 \n" 1487 "punpckhwd %%xmm2,%%xmm1 \n" 1488 "movdqa %%xmm0,(%[argb_buf]) \n" 1489 "movdqa %%xmm1,0x10(%[argb_buf]) \n" 1490 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1491 "sub $0x8,%[width] \n" 1492 "jg 1b \n" 1493 : [y_buf]"+r"(y_buf), // %[y_buf] 1494 [u_buf]"+r"(u_buf), // %[u_buf] 1495 [v_buf]"+r"(v_buf), // %[v_buf] 1496 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1497 [width]"+rm"(width) // %[width] 1498 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1499 : "memory", "cc" 1500#if defined(__SSE2__) 1501 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1502#endif 1503 ); 1504} 1505 1506void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, 1507 const uint8* uv_buf, 1508 uint8* argb_buf, 1509 int width) { 1510 asm volatile ( 1511 "pcmpeqb %%xmm5,%%xmm5 \n" 1512 "pxor %%xmm4,%%xmm4 \n" 1513 ".p2align 4 \n" 1514 "1: \n" 1515 READNV12 1516 YUVTORGB 1517 "punpcklbw %%xmm1,%%xmm0 \n" 1518 "punpcklbw %%xmm5,%%xmm2 \n" 1519 "movdqa %%xmm0,%%xmm1 \n" 1520 "punpcklwd %%xmm2,%%xmm0 \n" 1521 "punpckhwd %%xmm2,%%xmm1 \n" 1522 "movdqa %%xmm0,(%[argb_buf]) \n" 1523 "movdqa %%xmm1,0x10(%[argb_buf]) \n" 1524 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1525 "sub $0x8,%[width] \n" 1526 "jg 1b \n" 1527 : [y_buf]"+r"(y_buf), // %[y_buf] 1528 [uv_buf]"+r"(uv_buf), // %[uv_buf] 1529 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1530 [width]"+rm"(width) // %[width] 1531 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1532 : "memory", "cc" 1533#if defined(__SSE2__) 1534 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1535#endif 1536 ); 1537} 1538 1539void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, 1540 const uint8* vu_buf, 1541 uint8* argb_buf, 1542 int width) { 1543 asm volatile ( 1544 "pcmpeqb %%xmm5,%%xmm5 \n" 1545 "pxor %%xmm4,%%xmm4 \n" 1546 ".p2align 4 \n" 1547 "1: \n" 1548 READNV12 1549 YVUTORGB 1550 "punpcklbw %%xmm1,%%xmm0 \n" 1551 "punpcklbw %%xmm5,%%xmm2 \n" 1552 "movdqa %%xmm0,%%xmm1 \n" 1553 "punpcklwd %%xmm2,%%xmm0 \n" 1554 "punpckhwd %%xmm2,%%xmm1 \n" 1555 "movdqa %%xmm0,(%[argb_buf]) \n" 1556 "movdqa %%xmm1,0x10(%[argb_buf]) \n" 1557 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1558 "sub $0x8,%[width] \n" 1559 "jg 1b \n" 1560 : [y_buf]"+r"(y_buf), // %[y_buf] 1561 [uv_buf]"+r"(vu_buf), // %[uv_buf] 1562 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1563 [width]"+rm"(width) // %[width] 1564 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1565 : "memory", "cc" 1566#if defined(__SSE2__) 1567 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1568#endif 1569 ); 1570} 1571 1572void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1573 const uint8* u_buf, 1574 const uint8* v_buf, 1575 uint8* argb_buf, 1576 int width) { 1577 asm volatile ( 1578 "sub %[u_buf],%[v_buf] \n" 1579 "pcmpeqb %%xmm5,%%xmm5 \n" 1580 "pxor %%xmm4,%%xmm4 \n" 1581 ".p2align 4 \n" 1582 "1: \n" 1583 READYUV444 1584 YUVTORGB 1585 "punpcklbw %%xmm1,%%xmm0 \n" 1586 "punpcklbw %%xmm5,%%xmm2 \n" 1587 "movdqa %%xmm0,%%xmm1 \n" 1588 "punpcklwd %%xmm2,%%xmm0 \n" 1589 "punpckhwd %%xmm2,%%xmm1 \n" 1590 "movdqu %%xmm0,(%[argb_buf]) \n" 1591 "movdqu %%xmm1,0x10(%[argb_buf]) \n" 1592 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1593 "sub $0x8,%[width] \n" 1594 "jg 1b \n" 1595 : [y_buf]"+r"(y_buf), // %[y_buf] 1596 [u_buf]"+r"(u_buf), // %[u_buf] 1597 [v_buf]"+r"(v_buf), // %[v_buf] 1598 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1599 [width]"+rm"(width) // %[width] 1600 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1601 : "memory", "cc" 1602#if defined(__SSE2__) 1603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1604#endif 1605 ); 1606} 1607 1608void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1609 const uint8* u_buf, 1610 const uint8* v_buf, 1611 uint8* argb_buf, 1612 int width) { 1613 asm volatile ( 1614 "sub %[u_buf],%[v_buf] \n" 1615 "pcmpeqb %%xmm5,%%xmm5 \n" 1616 "pxor %%xmm4,%%xmm4 \n" 1617 ".p2align 4 \n" 1618 "1: \n" 1619 READYUV422 1620 YUVTORGB 1621 "punpcklbw %%xmm1,%%xmm0 \n" 1622 "punpcklbw %%xmm5,%%xmm2 \n" 1623 "movdqa %%xmm0,%%xmm1 \n" 1624 "punpcklwd %%xmm2,%%xmm0 \n" 1625 "punpckhwd %%xmm2,%%xmm1 \n" 1626 "movdqu %%xmm0,(%[argb_buf]) \n" 1627 "movdqu %%xmm1,0x10(%[argb_buf]) \n" 1628 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1629 "sub $0x8,%[width] \n" 1630 "jg 1b \n" 1631 : [y_buf]"+r"(y_buf), // %[y_buf] 1632 [u_buf]"+r"(u_buf), // %[u_buf] 1633 [v_buf]"+r"(v_buf), // %[v_buf] 1634 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1635 [width]"+rm"(width) // %[width] 1636 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1637 : "memory", "cc" 1638#if defined(__SSE2__) 1639 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1640#endif 1641 ); 1642} 1643 1644void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1645 const uint8* u_buf, 1646 const uint8* v_buf, 1647 uint8* argb_buf, 1648 int width) { 1649 asm volatile ( 1650 "sub %[u_buf],%[v_buf] \n" 1651 "pcmpeqb %%xmm5,%%xmm5 \n" 1652 "pxor %%xmm4,%%xmm4 \n" 1653 ".p2align 4 \n" 1654 "1: \n" 1655 READYUV411 1656 YUVTORGB 1657 "punpcklbw %%xmm1,%%xmm0 \n" 1658 "punpcklbw %%xmm5,%%xmm2 \n" 1659 "movdqa %%xmm0,%%xmm1 \n" 1660 "punpcklwd %%xmm2,%%xmm0 \n" 1661 "punpckhwd %%xmm2,%%xmm1 \n" 1662 "movdqu %%xmm0,(%[argb_buf]) \n" 1663 "movdqu %%xmm1,0x10(%[argb_buf]) \n" 1664 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1665 "sub $0x8,%[width] \n" 1666 "jg 1b \n" 1667 : [y_buf]"+r"(y_buf), // %[y_buf] 1668 [u_buf]"+r"(u_buf), // %[u_buf] 1669 [v_buf]"+r"(v_buf), // %[v_buf] 1670 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1671 [width]"+rm"(width) // %[width] 1672 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1673 : "memory", "cc" 1674#if defined(__SSE2__) 1675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1676#endif 1677 ); 1678} 1679 1680void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1681 const uint8* uv_buf, 1682 uint8* argb_buf, 1683 int width) { 1684 asm volatile ( 1685 "pcmpeqb %%xmm5,%%xmm5 \n" 1686 "pxor %%xmm4,%%xmm4 \n" 1687 ".p2align 4 \n" 1688 "1: \n" 1689 READNV12 1690 YUVTORGB 1691 "punpcklbw %%xmm1,%%xmm0 \n" 1692 "punpcklbw %%xmm5,%%xmm2 \n" 1693 "movdqa %%xmm0,%%xmm1 \n" 1694 "punpcklwd %%xmm2,%%xmm0 \n" 1695 "punpckhwd %%xmm2,%%xmm1 \n" 1696 "movdqu %%xmm0,(%[argb_buf]) \n" 1697 "movdqu %%xmm1,0x10(%[argb_buf]) \n" 1698 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1699 "sub $0x8,%[width] \n" 1700 "jg 1b \n" 1701 : [y_buf]"+r"(y_buf), // %[y_buf] 1702 [uv_buf]"+r"(uv_buf), // %[uv_buf] 1703 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1704 [width]"+rm"(width) // %[width] 1705 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1706 : "memory", "cc" 1707#if defined(__SSE2__) 1708 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1709#endif 1710 ); 1711} 1712 1713void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1714 const uint8* vu_buf, 1715 uint8* argb_buf, 1716 int width) { 1717 asm volatile ( 1718 "pcmpeqb %%xmm5,%%xmm5 \n" 1719 "pxor %%xmm4,%%xmm4 \n" 1720 ".p2align 4 \n" 1721 "1: \n" 1722 READNV12 1723 YVUTORGB 1724 "punpcklbw %%xmm1,%%xmm0 \n" 1725 "punpcklbw %%xmm5,%%xmm2 \n" 1726 "movdqa %%xmm0,%%xmm1 \n" 1727 "punpcklwd %%xmm2,%%xmm0 \n" 1728 "punpckhwd %%xmm2,%%xmm1 \n" 1729 "movdqu %%xmm0,(%[argb_buf]) \n" 1730 "movdqu %%xmm1,0x10(%[argb_buf]) \n" 1731 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1732 "sub $0x8,%[width] \n" 1733 "jg 1b \n" 1734 : [y_buf]"+r"(y_buf), // %[y_buf] 1735 [uv_buf]"+r"(vu_buf), // %[uv_buf] 1736 [argb_buf]"+r"(argb_buf), // %[argb_buf] 1737 [width]"+rm"(width) // %[width] 1738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1739 : "memory", "cc" 1740#if defined(__SSE2__) 1741 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1742#endif 1743 ); 1744} 1745 1746void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, 1747 const uint8* u_buf, 1748 const uint8* v_buf, 1749 uint8* bgra_buf, 1750 int width) { 1751 asm volatile ( 1752 "sub %[u_buf],%[v_buf] \n" 1753 "pcmpeqb %%xmm5,%%xmm5 \n" 1754 "pxor %%xmm4,%%xmm4 \n" 1755 ".p2align 4 \n" 1756 "1: \n" 1757 READYUV422 1758 YUVTORGB 1759 "pcmpeqb %%xmm5,%%xmm5 \n" 1760 "punpcklbw %%xmm0,%%xmm1 \n" 1761 "punpcklbw %%xmm2,%%xmm5 \n" 1762 "movdqa %%xmm5,%%xmm0 \n" 1763 "punpcklwd %%xmm1,%%xmm5 \n" 1764 "punpckhwd %%xmm1,%%xmm0 \n" 1765 "movdqa %%xmm5,(%[argb_buf]) \n" 1766 "movdqa %%xmm0,0x10(%[argb_buf]) \n" 1767 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1768 "sub $0x8,%[width] \n" 1769 "jg 1b \n" 1770 : [y_buf]"+r"(y_buf), // %[y_buf] 1771 [u_buf]"+r"(u_buf), // %[u_buf] 1772 [v_buf]"+r"(v_buf), // %[v_buf] 1773 [argb_buf]"+r"(bgra_buf), // %[argb_buf] 1774 [width]"+rm"(width) // %[width] 1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1776 : "memory", "cc" 1777#if defined(__SSE2__) 1778 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1779#endif 1780 ); 1781} 1782 1783void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, 1784 const uint8* u_buf, 1785 const uint8* v_buf, 1786 uint8* abgr_buf, 1787 int width) { 1788 asm volatile ( 1789 "sub %[u_buf],%[v_buf] \n" 1790 "pcmpeqb %%xmm5,%%xmm5 \n" 1791 "pxor %%xmm4,%%xmm4 \n" 1792 ".p2align 4 \n" 1793 "1: \n" 1794 READYUV422 1795 YUVTORGB 1796 "punpcklbw %%xmm1,%%xmm2 \n" 1797 "punpcklbw %%xmm5,%%xmm0 \n" 1798 "movdqa %%xmm2,%%xmm1 \n" 1799 "punpcklwd %%xmm0,%%xmm2 \n" 1800 "punpckhwd %%xmm0,%%xmm1 \n" 1801 "movdqa %%xmm2,(%[argb_buf]) \n" 1802 "movdqa %%xmm1,0x10(%[argb_buf]) \n" 1803 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1804 "sub $0x8,%[width] \n" 1805 "jg 1b \n" 1806 : [y_buf]"+r"(y_buf), // %[y_buf] 1807 [u_buf]"+r"(u_buf), // %[u_buf] 1808 [v_buf]"+r"(v_buf), // %[v_buf] 1809 [argb_buf]"+r"(abgr_buf), // %[argb_buf] 1810 [width]"+rm"(width) // %[width] 1811 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1812 : "memory", "cc" 1813#if defined(__SSE2__) 1814 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1815#endif 1816 ); 1817} 1818 1819void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, 1820 const uint8* u_buf, 1821 const uint8* v_buf, 1822 uint8* bgra_buf, 1823 int width) { 1824 asm volatile ( 1825 "sub %[u_buf],%[v_buf] \n" 1826 "pcmpeqb %%xmm5,%%xmm5 \n" 1827 "pxor %%xmm4,%%xmm4 \n" 1828 ".p2align 4 \n" 1829 "1: \n" 1830 READYUV422 1831 YUVTORGB 1832 "pcmpeqb %%xmm5,%%xmm5 \n" 1833 "punpcklbw %%xmm0,%%xmm1 \n" 1834 "punpcklbw %%xmm2,%%xmm5 \n" 1835 "movdqa %%xmm5,%%xmm0 \n" 1836 "punpcklwd %%xmm1,%%xmm5 \n" 1837 "punpckhwd %%xmm1,%%xmm0 \n" 1838 "movdqu %%xmm5,(%[argb_buf]) \n" 1839 "movdqu %%xmm0,0x10(%[argb_buf]) \n" 1840 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1841 "sub $0x8,%[width] \n" 1842 "jg 1b \n" 1843 : [y_buf]"+r"(y_buf), // %[y_buf] 1844 [u_buf]"+r"(u_buf), // %[u_buf] 1845 [v_buf]"+r"(v_buf), // %[v_buf] 1846 [argb_buf]"+r"(bgra_buf), // %[argb_buf] 1847 [width]"+rm"(width) // %[width] 1848 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1849 : "memory", "cc" 1850#if defined(__SSE2__) 1851 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1852#endif 1853 ); 1854} 1855 1856void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, 1857 const uint8* u_buf, 1858 const uint8* v_buf, 1859 uint8* abgr_buf, 1860 int width) { 1861 asm volatile ( 1862 "sub %[u_buf],%[v_buf] \n" 1863 "pcmpeqb %%xmm5,%%xmm5 \n" 1864 "pxor %%xmm4,%%xmm4 \n" 1865 ".p2align 4 \n" 1866 "1: \n" 1867 READYUV422 1868 YUVTORGB 1869 "punpcklbw %%xmm1,%%xmm2 \n" 1870 "punpcklbw %%xmm5,%%xmm0 \n" 1871 "movdqa %%xmm2,%%xmm1 \n" 1872 "punpcklwd %%xmm0,%%xmm2 \n" 1873 "punpckhwd %%xmm0,%%xmm1 \n" 1874 "movdqu %%xmm2,(%[argb_buf]) \n" 1875 "movdqu %%xmm1,0x10(%[argb_buf]) \n" 1876 "lea 0x20(%[argb_buf]),%[argb_buf] \n" 1877 "sub $0x8,%[width] \n" 1878 "jg 1b \n" 1879 : [y_buf]"+r"(y_buf), // %[y_buf] 1880 [u_buf]"+r"(u_buf), // %[u_buf] 1881 [v_buf]"+r"(v_buf), // %[v_buf] 1882 [argb_buf]"+r"(abgr_buf), // %[argb_buf] 1883 [width]"+rm"(width) // %[width] 1884 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1885 : "memory", "cc" 1886#if defined(__SSE2__) 1887 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1888#endif 1889 ); 1890} 1891#endif // HAS_I422TOARGBROW_SSSE3 1892 1893#ifdef HAS_YTOARGBROW_SSE2 1894void YToARGBRow_SSE2(const uint8* y_buf, 1895 uint8* rgb_buf, 1896 int width) { 1897 asm volatile ( 1898 "pcmpeqb %%xmm4,%%xmm4 \n" 1899 "pslld $0x18,%%xmm4 \n" 1900 "mov $0x10001000,%%eax \n" 1901 "movd %%eax,%%xmm3 \n" 1902 "pshufd $0x0,%%xmm3,%%xmm3 \n" 1903 "mov $0x012a012a,%%eax \n" 1904 "movd %%eax,%%xmm2 \n" 1905 "pshufd $0x0,%%xmm2,%%xmm2 \n" 1906 ".p2align 4 \n" 1907 "1: \n" 1908 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 1909 "movq (%0),%%xmm0 \n" 1910 "lea 0x8(%0),%0 \n" 1911 "punpcklbw %%xmm0,%%xmm0 \n" 1912 "psubusw %%xmm3,%%xmm0 \n" 1913 "pmulhuw %%xmm2,%%xmm0 \n" 1914 "packuswb %%xmm0,%%xmm0 \n" 1915 1916 // Step 2: Weave into ARGB 1917 "punpcklbw %%xmm0,%%xmm0 \n" 1918 "movdqa %%xmm0,%%xmm1 \n" 1919 "punpcklwd %%xmm0,%%xmm0 \n" 1920 "punpckhwd %%xmm1,%%xmm1 \n" 1921 "por %%xmm4,%%xmm0 \n" 1922 "por %%xmm4,%%xmm1 \n" 1923 "movdqa %%xmm0,(%1) \n" 1924 "movdqa %%xmm1,16(%1) \n" 1925 "lea 32(%1),%1 \n" 1926 1927 "sub $0x8,%2 \n" 1928 "jg 1b \n" 1929 : "+r"(y_buf), // %0 1930 "+r"(rgb_buf), // %1 1931 "+rm"(width) // %2 1932 : 1933 : "memory", "cc", "eax" 1934#if defined(__SSE2__) 1935 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 1936#endif 1937 ); 1938} 1939#endif // HAS_YTOARGBROW_SSE2 1940 1941#ifdef HAS_MIRRORROW_SSSE3 1942// Shuffle table for reversing the bytes. 1943CONST uvec8 kShuffleMirror = { 1944 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 1945}; 1946 1947void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 1948 intptr_t temp_width = static_cast<intptr_t>(width); 1949 asm volatile ( 1950 "movdqa %3,%%xmm5 \n" 1951 "lea -0x10(%0),%0 \n" 1952 ".p2align 4 \n" 1953 "1: \n" 1954 "movdqa (%0,%2),%%xmm0 \n" 1955 "pshufb %%xmm5,%%xmm0 \n" 1956 "sub $0x10,%2 \n" 1957 "movdqa %%xmm0,(%1) \n" 1958 "lea 0x10(%1),%1 \n" 1959 "jg 1b \n" 1960 : "+r"(src), // %0 1961 "+r"(dst), // %1 1962 "+r"(temp_width) // %2 1963 : "m"(kShuffleMirror) // %3 1964 : "memory", "cc" 1965#if defined(__SSE2__) 1966 , "xmm0", "xmm5" 1967#endif 1968 ); 1969} 1970#endif // HAS_MIRRORROW_SSSE3 1971 1972#ifdef HAS_MIRRORROW_SSE2 1973void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 1974 intptr_t temp_width = static_cast<intptr_t>(width); 1975 asm volatile ( 1976 "lea -0x10(%0),%0 \n" 1977 ".p2align 4 \n" 1978 "1: \n" 1979 "movdqu (%0,%2),%%xmm0 \n" 1980 "movdqa %%xmm0,%%xmm1 \n" 1981 "psllw $0x8,%%xmm0 \n" 1982 "psrlw $0x8,%%xmm1 \n" 1983 "por %%xmm1,%%xmm0 \n" 1984 "pshuflw $0x1b,%%xmm0,%%xmm0 \n" 1985 "pshufhw $0x1b,%%xmm0,%%xmm0 \n" 1986 "pshufd $0x4e,%%xmm0,%%xmm0 \n" 1987 "sub $0x10,%2 \n" 1988 "movdqu %%xmm0,(%1) \n" 1989 "lea 0x10(%1),%1 \n" 1990 "jg 1b \n" 1991 : "+r"(src), // %0 1992 "+r"(dst), // %1 1993 "+r"(temp_width) // %2 1994 : 1995 : "memory", "cc" 1996#if defined(__SSE2__) 1997 , "xmm0", "xmm1" 1998#endif 1999 ); 2000} 2001#endif // HAS_MIRRORROW_SSE2 2002 2003#ifdef HAS_MIRRORROW_UV_SSSE3 2004// Shuffle table for reversing the bytes of UV channels. 2005CONST uvec8 kShuffleMirrorUV = { 2006 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 2007}; 2008void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 2009 int width) { 2010 intptr_t temp_width = static_cast<intptr_t>(width); 2011 asm volatile ( 2012 "movdqa %4,%%xmm1 \n" 2013 "lea -16(%0,%3,2),%0 \n" 2014 "sub %1,%2 \n" 2015 ".p2align 4 \n" 2016 "1: \n" 2017 "movdqa (%0),%%xmm0 \n" 2018 "lea -16(%0),%0 \n" 2019 "pshufb %%xmm1,%%xmm0 \n" 2020 "sub $8,%3 \n" 2021 "movlpd %%xmm0,(%1) \n" 2022 "movhpd %%xmm0,(%1,%2) \n" 2023 "lea 8(%1),%1 \n" 2024 "jg 1b \n" 2025 : "+r"(src), // %0 2026 "+r"(dst_u), // %1 2027 "+r"(dst_v), // %2 2028 "+r"(temp_width) // %3 2029 : "m"(kShuffleMirrorUV) // %4 2030 : "memory", "cc" 2031#if defined(__SSE2__) 2032 , "xmm0", "xmm1" 2033#endif 2034 ); 2035} 2036#endif // HAS_MIRRORROW_UV_SSSE3 2037 2038#ifdef HAS_ARGBMIRRORROW_SSSE3 2039// Shuffle table for reversing the bytes. 2040CONST uvec8 kARGBShuffleMirror = { 2041 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u 2042}; 2043 2044void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 2045 intptr_t temp_width = static_cast<intptr_t>(width); 2046 asm volatile ( 2047 "movdqa %3,%%xmm5 \n" 2048 "lea -0x10(%0),%0 \n" 2049 ".p2align 4 \n" 2050 "1: \n" 2051 "movdqa (%0,%2,4),%%xmm0 \n" 2052 "pshufb %%xmm5,%%xmm0 \n" 2053 "sub $0x4,%2 \n" 2054 "movdqa %%xmm0,(%1) \n" 2055 "lea 0x10(%1),%1 \n" 2056 "jg 1b \n" 2057 : "+r"(src), // %0 2058 "+r"(dst), // %1 2059 "+r"(temp_width) // %2 2060 : "m"(kARGBShuffleMirror) // %3 2061 : "memory", "cc" 2062#if defined(__SSE2__) 2063 , "xmm0", "xmm5" 2064#endif 2065 ); 2066} 2067#endif // HAS_ARGBMIRRORROW_SSSE3 2068 2069#ifdef HAS_SPLITUV_SSE2 2070void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 2071 asm volatile ( 2072 "pcmpeqb %%xmm5,%%xmm5 \n" 2073 "psrlw $0x8,%%xmm5 \n" 2074 "sub %1,%2 \n" 2075 ".p2align 4 \n" 2076 "1: \n" 2077 "movdqa (%0),%%xmm0 \n" 2078 "movdqa 0x10(%0),%%xmm1 \n" 2079 "lea 0x20(%0),%0 \n" 2080 "movdqa %%xmm0,%%xmm2 \n" 2081 "movdqa %%xmm1,%%xmm3 \n" 2082 "pand %%xmm5,%%xmm0 \n" 2083 "pand %%xmm5,%%xmm1 \n" 2084 "packuswb %%xmm1,%%xmm0 \n" 2085 "psrlw $0x8,%%xmm2 \n" 2086 "psrlw $0x8,%%xmm3 \n" 2087 "packuswb %%xmm3,%%xmm2 \n" 2088 "movdqa %%xmm0,(%1) \n" 2089 "movdqa %%xmm2,(%1,%2) \n" 2090 "lea 0x10(%1),%1 \n" 2091 "sub $0x10,%3 \n" 2092 "jg 1b \n" 2093 : "+r"(src_uv), // %0 2094 "+r"(dst_u), // %1 2095 "+r"(dst_v), // %2 2096 "+r"(pix) // %3 2097 : 2098 : "memory", "cc" 2099#if defined(__SSE2__) 2100 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2101#endif 2102 ); 2103} 2104#endif // HAS_SPLITUV_SSE2 2105 2106#ifdef HAS_COPYROW_SSE2 2107void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 2108 asm volatile ( 2109 "sub %0,%1 \n" 2110 ".p2align 4 \n" 2111 "1: \n" 2112 "movdqa (%0),%%xmm0 \n" 2113 "movdqa 0x10(%0),%%xmm1 \n" 2114 "movdqa %%xmm0,(%0,%1) \n" 2115 "movdqa %%xmm1,0x10(%0,%1) \n" 2116 "lea 0x20(%0),%0 \n" 2117 "sub $0x20,%2 \n" 2118 "jg 1b \n" 2119 : "+r"(src), // %0 2120 "+r"(dst), // %1 2121 "+r"(count) // %2 2122 : 2123 : "memory", "cc" 2124#if defined(__SSE2__) 2125 , "xmm0", "xmm1" 2126#endif 2127 ); 2128} 2129#endif // HAS_COPYROW_SSE2 2130 2131#ifdef HAS_COPYROW_X86 2132void CopyRow_X86(const uint8* src, uint8* dst, int width) { 2133 size_t width_tmp = static_cast<size_t>(width); 2134 asm volatile ( 2135 "shr $0x2,%2 \n" 2136 "rep movsl \n" 2137 : "+S"(src), // %0 2138 "+D"(dst), // %1 2139 "+c"(width_tmp) // %2 2140 : 2141 : "memory", "cc" 2142 ); 2143} 2144#endif // HAS_COPYROW_X86 2145 2146#ifdef HAS_SETROW_X86 2147void SetRow8_X86(uint8* dst, uint32 v32, int width) { 2148 size_t width_tmp = static_cast<size_t>(width); 2149 asm volatile ( 2150 "shr $0x2,%1 \n" 2151 "rep stosl \n" 2152 : "+D"(dst), // %0 2153 "+c"(width_tmp) // %1 2154 : "a"(v32) // %2 2155 : "memory", "cc"); 2156} 2157 2158void SetRows32_X86(uint8* dst, uint32 v32, int width, 2159 int dst_stride, int height) { 2160 for (int y = 0; y < height; ++y) { 2161 size_t width_tmp = static_cast<size_t>(width); 2162 uint32* d = reinterpret_cast<uint32*>(dst); 2163 asm volatile ( 2164 "rep stosl \n" 2165 : "+D"(d), // %0 2166 "+c"(width_tmp) // %1 2167 : "a"(v32) // %2 2168 : "memory", "cc"); 2169 dst += dst_stride; 2170 } 2171} 2172#endif // HAS_SETROW_X86 2173 2174#ifdef HAS_YUY2TOYROW_SSE2 2175void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { 2176 asm volatile ( 2177 "pcmpeqb %%xmm5,%%xmm5 \n" 2178 "psrlw $0x8,%%xmm5 \n" 2179 ".p2align 4 \n" 2180 "1: \n" 2181 "movdqa (%0),%%xmm0 \n" 2182 "movdqa 0x10(%0),%%xmm1 \n" 2183 "lea 0x20(%0),%0 \n" 2184 "pand %%xmm5,%%xmm0 \n" 2185 "pand %%xmm5,%%xmm1 \n" 2186 "packuswb %%xmm1,%%xmm0 \n" 2187 "movdqa %%xmm0,(%1) \n" 2188 "lea 0x10(%1),%1 \n" 2189 "sub $0x10,%2 \n" 2190 "jg 1b \n" 2191 : "+r"(src_yuy2), // %0 2192 "+r"(dst_y), // %1 2193 "+r"(pix) // %2 2194 : 2195 : "memory", "cc" 2196#if defined(__SSE2__) 2197 , "xmm0", "xmm1", "xmm5" 2198#endif 2199 ); 2200} 2201 2202void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 2203 uint8* dst_u, uint8* dst_v, int pix) { 2204 asm volatile ( 2205 "pcmpeqb %%xmm5,%%xmm5 \n" 2206 "psrlw $0x8,%%xmm5 \n" 2207 "sub %1,%2 \n" 2208 ".p2align 4 \n" 2209 "1: \n" 2210 "movdqa (%0),%%xmm0 \n" 2211 "movdqa 0x10(%0),%%xmm1 \n" 2212 "movdqa (%0,%4,1),%%xmm2 \n" 2213 "movdqa 0x10(%0,%4,1),%%xmm3 \n" 2214 "lea 0x20(%0),%0 \n" 2215 "pavgb %%xmm2,%%xmm0 \n" 2216 "pavgb %%xmm3,%%xmm1 \n" 2217 "psrlw $0x8,%%xmm0 \n" 2218 "psrlw $0x8,%%xmm1 \n" 2219 "packuswb %%xmm1,%%xmm0 \n" 2220 "movdqa %%xmm0,%%xmm1 \n" 2221 "pand %%xmm5,%%xmm0 \n" 2222 "packuswb %%xmm0,%%xmm0 \n" 2223 "psrlw $0x8,%%xmm1 \n" 2224 "packuswb %%xmm1,%%xmm1 \n" 2225 "movq %%xmm0,(%1) \n" 2226 "movq %%xmm1,(%1,%2) \n" 2227 "lea 0x8(%1),%1 \n" 2228 "sub $0x10,%3 \n" 2229 "jg 1b \n" 2230 : "+r"(src_yuy2), // %0 2231 "+r"(dst_u), // %1 2232 "+r"(dst_v), // %2 2233 "+r"(pix) // %3 2234 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 2235 : "memory", "cc" 2236#if defined(__SSE2__) 2237 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2238#endif 2239 ); 2240} 2241 2242void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 2243 uint8* dst_u, uint8* dst_v, int pix) { 2244 asm volatile ( 2245 "pcmpeqb %%xmm5,%%xmm5 \n" 2246 "psrlw $0x8,%%xmm5 \n" 2247 "sub %1,%2 \n" 2248 ".p2align 4 \n" 2249 "1: \n" 2250 "movdqa (%0),%%xmm0 \n" 2251 "movdqa 0x10(%0),%%xmm1 \n" 2252 "lea 0x20(%0),%0 \n" 2253 "psrlw $0x8,%%xmm0 \n" 2254 "psrlw $0x8,%%xmm1 \n" 2255 "packuswb %%xmm1,%%xmm0 \n" 2256 "movdqa %%xmm0,%%xmm1 \n" 2257 "pand %%xmm5,%%xmm0 \n" 2258 "packuswb %%xmm0,%%xmm0 \n" 2259 "psrlw $0x8,%%xmm1 \n" 2260 "packuswb %%xmm1,%%xmm1 \n" 2261 "movq %%xmm0,(%1) \n" 2262 "movq %%xmm1,(%1,%2) \n" 2263 "lea 0x8(%1),%1 \n" 2264 "sub $0x10,%3 \n" 2265 "jg 1b \n" 2266 : "+r"(src_yuy2), // %0 2267 "+r"(dst_u), // %1 2268 "+r"(dst_v), // %2 2269 "+r"(pix) // %3 2270 : 2271 : "memory", "cc" 2272#if defined(__SSE2__) 2273 , "xmm0", "xmm1", "xmm5" 2274#endif 2275 ); 2276} 2277 2278void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, 2279 uint8* dst_y, int pix) { 2280 asm volatile ( 2281 "pcmpeqb %%xmm5,%%xmm5 \n" 2282 "psrlw $0x8,%%xmm5 \n" 2283 ".p2align 4 \n" 2284 "1: \n" 2285 "movdqu (%0),%%xmm0 \n" 2286 "movdqu 0x10(%0),%%xmm1 \n" 2287 "lea 0x20(%0),%0 \n" 2288 "pand %%xmm5,%%xmm0 \n" 2289 "pand %%xmm5,%%xmm1 \n" 2290 "packuswb %%xmm1,%%xmm0 \n" 2291 "sub $0x10,%2 \n" 2292 "movdqu %%xmm0,(%1) \n" 2293 "lea 0x10(%1),%1 \n" 2294 "jg 1b \n" 2295 : "+r"(src_yuy2), // %0 2296 "+r"(dst_y), // %1 2297 "+r"(pix) // %2 2298 : 2299 : "memory", "cc" 2300#if defined(__SSE2__) 2301 , "xmm0", "xmm1", "xmm5" 2302#endif 2303 ); 2304} 2305 2306void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, 2307 int stride_yuy2, 2308 uint8* dst_u, uint8* dst_v, int pix) { 2309 asm volatile ( 2310 "pcmpeqb %%xmm5,%%xmm5 \n" 2311 "psrlw $0x8,%%xmm5 \n" 2312 "sub %1,%2 \n" 2313 ".p2align 4 \n" 2314 "1: \n" 2315 "movdqu (%0),%%xmm0 \n" 2316 "movdqu 0x10(%0),%%xmm1 \n" 2317 "movdqu (%0,%4,1),%%xmm2 \n" 2318 "movdqu 0x10(%0,%4,1),%%xmm3 \n" 2319 "lea 0x20(%0),%0 \n" 2320 "pavgb %%xmm2,%%xmm0 \n" 2321 "pavgb %%xmm3,%%xmm1 \n" 2322 "psrlw $0x8,%%xmm0 \n" 2323 "psrlw $0x8,%%xmm1 \n" 2324 "packuswb %%xmm1,%%xmm0 \n" 2325 "movdqa %%xmm0,%%xmm1 \n" 2326 "pand %%xmm5,%%xmm0 \n" 2327 "packuswb %%xmm0,%%xmm0 \n" 2328 "psrlw $0x8,%%xmm1 \n" 2329 "packuswb %%xmm1,%%xmm1 \n" 2330 "movq %%xmm0,(%1) \n" 2331 "movq %%xmm1,(%1,%2) \n" 2332 "lea 0x8(%1),%1 \n" 2333 "sub $0x10,%3 \n" 2334 "jg 1b \n" 2335 : "+r"(src_yuy2), // %0 2336 "+r"(dst_u), // %1 2337 "+r"(dst_v), // %2 2338 "+r"(pix) // %3 2339 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 2340 : "memory", "cc" 2341#if defined(__SSE2__) 2342 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2343#endif 2344 ); 2345} 2346 2347void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, 2348 uint8* dst_u, uint8* dst_v, int pix) { 2349 asm volatile ( 2350 "pcmpeqb %%xmm5,%%xmm5 \n" 2351 "psrlw $0x8,%%xmm5 \n" 2352 "sub %1,%2 \n" 2353 ".p2align 4 \n" 2354 "1: \n" 2355 "movdqu (%0),%%xmm0 \n" 2356 "movdqu 0x10(%0),%%xmm1 \n" 2357 "lea 0x20(%0),%0 \n" 2358 "psrlw $0x8,%%xmm0 \n" 2359 "psrlw $0x8,%%xmm1 \n" 2360 "packuswb %%xmm1,%%xmm0 \n" 2361 "movdqa %%xmm0,%%xmm1 \n" 2362 "pand %%xmm5,%%xmm0 \n" 2363 "packuswb %%xmm0,%%xmm0 \n" 2364 "psrlw $0x8,%%xmm1 \n" 2365 "packuswb %%xmm1,%%xmm1 \n" 2366 "movq %%xmm0,(%1) \n" 2367 "movq %%xmm1,(%1,%2) \n" 2368 "lea 0x8(%1),%1 \n" 2369 "sub $0x10,%3 \n" 2370 "jg 1b \n" 2371 : "+r"(src_yuy2), // %0 2372 "+r"(dst_u), // %1 2373 "+r"(dst_v), // %2 2374 "+r"(pix) // %3 2375 : 2376 : "memory", "cc" 2377#if defined(__SSE2__) 2378 , "xmm0", "xmm1", "xmm5" 2379#endif 2380 ); 2381} 2382 2383void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { 2384 asm volatile ( 2385 ".p2align 4 \n" 2386 "1: \n" 2387 "movdqa (%0),%%xmm0 \n" 2388 "movdqa 0x10(%0),%%xmm1 \n" 2389 "lea 0x20(%0),%0 \n" 2390 "psrlw $0x8,%%xmm0 \n" 2391 "psrlw $0x8,%%xmm1 \n" 2392 "packuswb %%xmm1,%%xmm0 \n" 2393 "sub $0x10,%2 \n" 2394 "movdqa %%xmm0,(%1) \n" 2395 "lea 0x10(%1),%1 \n" 2396 "jg 1b \n" 2397 : "+r"(src_uyvy), // %0 2398 "+r"(dst_y), // %1 2399 "+r"(pix) // %2 2400 : 2401 : "memory", "cc" 2402#if defined(__SSE2__) 2403 , "xmm0", "xmm1" 2404#endif 2405 ); 2406} 2407 2408void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 2409 uint8* dst_u, uint8* dst_v, int pix) { 2410 asm volatile ( 2411 "pcmpeqb %%xmm5,%%xmm5 \n" 2412 "psrlw $0x8,%%xmm5 \n" 2413 "sub %1,%2 \n" 2414 ".p2align 4 \n" 2415 "1: \n" 2416 "movdqa (%0),%%xmm0 \n" 2417 "movdqa 0x10(%0),%%xmm1 \n" 2418 "movdqa (%0,%4,1),%%xmm2 \n" 2419 "movdqa 0x10(%0,%4,1),%%xmm3 \n" 2420 "lea 0x20(%0),%0 \n" 2421 "pavgb %%xmm2,%%xmm0 \n" 2422 "pavgb %%xmm3,%%xmm1 \n" 2423 "pand %%xmm5,%%xmm0 \n" 2424 "pand %%xmm5,%%xmm1 \n" 2425 "packuswb %%xmm1,%%xmm0 \n" 2426 "movdqa %%xmm0,%%xmm1 \n" 2427 "pand %%xmm5,%%xmm0 \n" 2428 "packuswb %%xmm0,%%xmm0 \n" 2429 "psrlw $0x8,%%xmm1 \n" 2430 "packuswb %%xmm1,%%xmm1 \n" 2431 "movq %%xmm0,(%1) \n" 2432 "movq %%xmm1,(%1,%2) \n" 2433 "lea 0x8(%1),%1 \n" 2434 "sub $0x10,%3 \n" 2435 "jg 1b \n" 2436 : "+r"(src_uyvy), // %0 2437 "+r"(dst_u), // %1 2438 "+r"(dst_v), // %2 2439 "+r"(pix) // %3 2440 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 2441 : "memory", "cc" 2442#if defined(__SSE2__) 2443 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2444#endif 2445 ); 2446} 2447 2448void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 2449 uint8* dst_u, uint8* dst_v, int pix) { 2450 asm volatile ( 2451 "pcmpeqb %%xmm5,%%xmm5 \n" 2452 "psrlw $0x8,%%xmm5 \n" 2453 "sub %1,%2 \n" 2454 ".p2align 4 \n" 2455 "1: \n" 2456 "movdqa (%0),%%xmm0 \n" 2457 "movdqa 0x10(%0),%%xmm1 \n" 2458 "lea 0x20(%0),%0 \n" 2459 "pand %%xmm5,%%xmm0 \n" 2460 "pand %%xmm5,%%xmm1 \n" 2461 "packuswb %%xmm1,%%xmm0 \n" 2462 "movdqa %%xmm0,%%xmm1 \n" 2463 "pand %%xmm5,%%xmm0 \n" 2464 "packuswb %%xmm0,%%xmm0 \n" 2465 "psrlw $0x8,%%xmm1 \n" 2466 "packuswb %%xmm1,%%xmm1 \n" 2467 "movq %%xmm0,(%1) \n" 2468 "movq %%xmm1,(%1,%2) \n" 2469 "lea 0x8(%1),%1 \n" 2470 "sub $0x10,%3 \n" 2471 "jg 1b \n" 2472 : "+r"(src_uyvy), // %0 2473 "+r"(dst_u), // %1 2474 "+r"(dst_v), // %2 2475 "+r"(pix) // %3 2476 : 2477 : "memory", "cc" 2478#if defined(__SSE2__) 2479 , "xmm0", "xmm1", "xmm5" 2480#endif 2481 ); 2482} 2483 2484void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, 2485 uint8* dst_y, int pix) { 2486 asm volatile ( 2487 ".p2align 4 \n" 2488 "1: \n" 2489 "movdqu (%0),%%xmm0 \n" 2490 "movdqu 0x10(%0),%%xmm1 \n" 2491 "lea 0x20(%0),%0 \n" 2492 "psrlw $0x8,%%xmm0 \n" 2493 "psrlw $0x8,%%xmm1 \n" 2494 "packuswb %%xmm1,%%xmm0 \n" 2495 "sub $0x10,%2 \n" 2496 "movdqu %%xmm0,(%1) \n" 2497 "lea 0x10(%1),%1 \n" 2498 "jg 1b \n" 2499 : "+r"(src_uyvy), // %0 2500 "+r"(dst_y), // %1 2501 "+r"(pix) // %2 2502 : 2503 : "memory", "cc" 2504#if defined(__SSE2__) 2505 , "xmm0", "xmm1" 2506#endif 2507 ); 2508} 2509 2510void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, 2511 uint8* dst_u, uint8* dst_v, int pix) { 2512 asm volatile ( 2513 "pcmpeqb %%xmm5,%%xmm5 \n" 2514 "psrlw $0x8,%%xmm5 \n" 2515 "sub %1,%2 \n" 2516 ".p2align 4 \n" 2517 "1: \n" 2518 "movdqu (%0),%%xmm0 \n" 2519 "movdqu 0x10(%0),%%xmm1 \n" 2520 "movdqu (%0,%4,1),%%xmm2 \n" 2521 "movdqu 0x10(%0,%4,1),%%xmm3 \n" 2522 "lea 0x20(%0),%0 \n" 2523 "pavgb %%xmm2,%%xmm0 \n" 2524 "pavgb %%xmm3,%%xmm1 \n" 2525 "pand %%xmm5,%%xmm0 \n" 2526 "pand %%xmm5,%%xmm1 \n" 2527 "packuswb %%xmm1,%%xmm0 \n" 2528 "movdqa %%xmm0,%%xmm1 \n" 2529 "pand %%xmm5,%%xmm0 \n" 2530 "packuswb %%xmm0,%%xmm0 \n" 2531 "psrlw $0x8,%%xmm1 \n" 2532 "packuswb %%xmm1,%%xmm1 \n" 2533 "movq %%xmm0,(%1) \n" 2534 "movq %%xmm1,(%1,%2) \n" 2535 "lea 0x8(%1),%1 \n" 2536 "sub $0x10,%3 \n" 2537 "jg 1b \n" 2538 : "+r"(src_uyvy), // %0 2539 "+r"(dst_u), // %1 2540 "+r"(dst_v), // %2 2541 "+r"(pix) // %3 2542 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 2543 : "memory", "cc" 2544#if defined(__SSE2__) 2545 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2546#endif 2547 ); 2548} 2549 2550void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, 2551 uint8* dst_u, uint8* dst_v, int pix) { 2552 asm volatile ( 2553 "pcmpeqb %%xmm5,%%xmm5 \n" 2554 "psrlw $0x8,%%xmm5 \n" 2555 "sub %1,%2 \n" 2556 ".p2align 4 \n" 2557 "1: \n" 2558 "movdqu (%0),%%xmm0 \n" 2559 "movdqu 0x10(%0),%%xmm1 \n" 2560 "lea 0x20(%0),%0 \n" 2561 "pand %%xmm5,%%xmm0 \n" 2562 "pand %%xmm5,%%xmm1 \n" 2563 "packuswb %%xmm1,%%xmm0 \n" 2564 "movdqa %%xmm0,%%xmm1 \n" 2565 "pand %%xmm5,%%xmm0 \n" 2566 "packuswb %%xmm0,%%xmm0 \n" 2567 "psrlw $0x8,%%xmm1 \n" 2568 "packuswb %%xmm1,%%xmm1 \n" 2569 "movq %%xmm0,(%1) \n" 2570 "movq %%xmm1,(%1,%2) \n" 2571 "lea 0x8(%1),%1 \n" 2572 "sub $0x10,%3 \n" 2573 "jg 1b \n" 2574 : "+r"(src_uyvy), // %0 2575 "+r"(dst_u), // %1 2576 "+r"(dst_v), // %2 2577 "+r"(pix) // %3 2578 : 2579 : "memory", "cc" 2580#if defined(__SSE2__) 2581 , "xmm0", "xmm1", "xmm5" 2582#endif 2583 ); 2584} 2585#endif // HAS_YUY2TOYROW_SSE2 2586 2587#ifdef HAS_ARGBBLENDROW_SSE2 2588// Blend 8 pixels at a time. 2589void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 2590 uint8* dst_argb, int width) { 2591 asm volatile ( 2592 "pcmpeqb %%xmm7,%%xmm7 \n" 2593 "psrlw $0xf,%%xmm7 \n" 2594 "pcmpeqb %%xmm6,%%xmm6 \n" 2595 "psrlw $0x8,%%xmm6 \n" 2596 "pcmpeqb %%xmm5,%%xmm5 \n" 2597 "psllw $0x8,%%xmm5 \n" 2598 "pcmpeqb %%xmm4,%%xmm4 \n" 2599 "pslld $0x18,%%xmm4 \n" 2600 "sub $0x1,%3 \n" 2601 "je 91f \n" 2602 "jl 99f \n" 2603 2604 // 1 pixel loop until destination pointer is aligned. 2605 "10: \n" 2606 "test $0xf,%2 \n" 2607 "je 19f \n" 2608 "movd (%0),%%xmm3 \n" 2609 "lea 0x4(%0),%0 \n" 2610 "movdqa %%xmm3,%%xmm0 \n" 2611 "pxor %%xmm4,%%xmm3 \n" 2612 "movd (%1),%%xmm2 \n" 2613 "psrlw $0x8,%%xmm3 \n" 2614 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" 2615 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" 2616 "pand %%xmm6,%%xmm2 \n" 2617 "paddw %%xmm7,%%xmm3 \n" 2618 "pmullw %%xmm3,%%xmm2 \n" 2619 "movd (%1),%%xmm1 \n" 2620 "lea 0x4(%1),%1 \n" 2621 "psrlw $0x8,%%xmm1 \n" 2622 "por %%xmm4,%%xmm0 \n" 2623 "pmullw %%xmm3,%%xmm1 \n" 2624 "psrlw $0x8,%%xmm2 \n" 2625 "paddusb %%xmm2,%%xmm0 \n" 2626 "pand %%xmm5,%%xmm1 \n" 2627 "paddusb %%xmm1,%%xmm0 \n" 2628 "sub $0x1,%3 \n" 2629 "movd %%xmm0,(%2) \n" 2630 "lea 0x4(%2),%2 \n" 2631 "jge 10b \n" 2632 2633 "19: \n" 2634 "add $1-4,%3 \n" 2635 "jl 49f \n" 2636 2637 // 4 pixel loop. 2638 ".p2align 2 \n" 2639 "41: \n" 2640 "movdqu (%0),%%xmm3 \n" 2641 "lea 0x10(%0),%0 \n" 2642 "movdqa %%xmm3,%%xmm0 \n" 2643 "pxor %%xmm4,%%xmm3 \n" 2644 "movdqu (%1),%%xmm2 \n" 2645 "psrlw $0x8,%%xmm3 \n" 2646 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" 2647 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" 2648 "pand %%xmm6,%%xmm2 \n" 2649 "paddw %%xmm7,%%xmm3 \n" 2650 "pmullw %%xmm3,%%xmm2 \n" 2651 "movdqu (%1),%%xmm1 \n" 2652 "lea 0x10(%1),%1 \n" 2653 "psrlw $0x8,%%xmm1 \n" 2654 "por %%xmm4,%%xmm0 \n" 2655 "pmullw %%xmm3,%%xmm1 \n" 2656 "psrlw $0x8,%%xmm2 \n" 2657 "paddusb %%xmm2,%%xmm0 \n" 2658 "pand %%xmm5,%%xmm1 \n" 2659 "paddusb %%xmm1,%%xmm0 \n" 2660 "sub $0x4,%3 \n" 2661 "movdqa %%xmm0,(%2) \n" 2662 "lea 0x10(%2),%2 \n" 2663 "jge 41b \n" 2664 2665 "49: \n" 2666 "add $0x3,%3 \n" 2667 "jl 99f \n" 2668 2669 // 1 pixel loop. 2670 "91: \n" 2671 "movd (%0),%%xmm3 \n" 2672 "lea 0x4(%0),%0 \n" 2673 "movdqa %%xmm3,%%xmm0 \n" 2674 "pxor %%xmm4,%%xmm3 \n" 2675 "movd (%1),%%xmm2 \n" 2676 "psrlw $0x8,%%xmm3 \n" 2677 "pshufhw $0xf5,%%xmm3,%%xmm3 \n" 2678 "pshuflw $0xf5,%%xmm3,%%xmm3 \n" 2679 "pand %%xmm6,%%xmm2 \n" 2680 "paddw %%xmm7,%%xmm3 \n" 2681 "pmullw %%xmm3,%%xmm2 \n" 2682 "movd (%1),%%xmm1 \n" 2683 "lea 0x4(%1),%1 \n" 2684 "psrlw $0x8,%%xmm1 \n" 2685 "por %%xmm4,%%xmm0 \n" 2686 "pmullw %%xmm3,%%xmm1 \n" 2687 "psrlw $0x8,%%xmm2 \n" 2688 "paddusb %%xmm2,%%xmm0 \n" 2689 "pand %%xmm5,%%xmm1 \n" 2690 "paddusb %%xmm1,%%xmm0 \n" 2691 "sub $0x1,%3 \n" 2692 "movd %%xmm0,(%2) \n" 2693 "lea 0x4(%2),%2 \n" 2694 "jge 91b \n" 2695 "99: \n" 2696 : "+r"(src_argb0), // %0 2697 "+r"(src_argb1), // %1 2698 "+r"(dst_argb), // %2 2699 "+r"(width) // %3 2700 : 2701 : "memory", "cc" 2702#if defined(__SSE2__) 2703 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 2704#endif 2705 ); 2706} 2707#endif // HAS_ARGBBLENDROW_SSE2 2708 2709#ifdef HAS_ARGBBLENDROW_SSSE3 2710// Shuffle table for isolating alpha. 2711CONST uvec8 kShuffleAlpha = { 2712 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 2713 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 2714}; 2715 2716// Blend 8 pixels at a time 2717// Shuffle table for reversing the bytes. 2718 2719// Same as SSE2, but replaces 2720// psrlw xmm3, 8 // alpha 2721// pshufhw xmm3, xmm3,0F5h // 8 alpha words 2722// pshuflw xmm3, xmm3,0F5h 2723// with.. 2724// pshufb xmm3, kShuffleAlpha // alpha 2725 2726void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 2727 uint8* dst_argb, int width) { 2728 asm volatile ( 2729 "pcmpeqb %%xmm7,%%xmm7 \n" 2730 "psrlw $0xf,%%xmm7 \n" 2731 "pcmpeqb %%xmm6,%%xmm6 \n" 2732 "psrlw $0x8,%%xmm6 \n" 2733 "pcmpeqb %%xmm5,%%xmm5 \n" 2734 "psllw $0x8,%%xmm5 \n" 2735 "pcmpeqb %%xmm4,%%xmm4 \n" 2736 "pslld $0x18,%%xmm4 \n" 2737 "sub $0x1,%3 \n" 2738 "je 91f \n" 2739 "jl 99f \n" 2740 2741 // 1 pixel loop until destination pointer is aligned. 2742 "10: \n" 2743 "test $0xf,%2 \n" 2744 "je 19f \n" 2745 "movd (%0),%%xmm3 \n" 2746 "lea 0x4(%0),%0 \n" 2747 "movdqa %%xmm3,%%xmm0 \n" 2748 "pxor %%xmm4,%%xmm3 \n" 2749 "movd (%1),%%xmm2 \n" 2750 "pshufb %4,%%xmm3 \n" 2751 "pand %%xmm6,%%xmm2 \n" 2752 "paddw %%xmm7,%%xmm3 \n" 2753 "pmullw %%xmm3,%%xmm2 \n" 2754 "movd (%1),%%xmm1 \n" 2755 "lea 0x4(%1),%1 \n" 2756 "psrlw $0x8,%%xmm1 \n" 2757 "por %%xmm4,%%xmm0 \n" 2758 "pmullw %%xmm3,%%xmm1 \n" 2759 "psrlw $0x8,%%xmm2 \n" 2760 "paddusb %%xmm2,%%xmm0 \n" 2761 "pand %%xmm5,%%xmm1 \n" 2762 "paddusb %%xmm1,%%xmm0 \n" 2763 "sub $0x1,%3 \n" 2764 "movd %%xmm0,(%2) \n" 2765 "lea 0x4(%2),%2 \n" 2766 "jge 10b \n" 2767 2768 "19: \n" 2769 "add $1-4,%3 \n" 2770 "jl 49f \n" 2771 "test $0xf,%0 \n" 2772 "jne 41f \n" 2773 "test $0xf,%1 \n" 2774 "jne 41f \n" 2775 2776 // 4 pixel loop. 2777 ".p2align 2 \n" 2778 "40: \n" 2779 "movdqa (%0),%%xmm3 \n" 2780 "lea 0x10(%0),%0 \n" 2781 "movdqa %%xmm3,%%xmm0 \n" 2782 "pxor %%xmm4,%%xmm3 \n" 2783 "movdqa (%1),%%xmm2 \n" 2784 "pshufb %4,%%xmm3 \n" 2785 "pand %%xmm6,%%xmm2 \n" 2786 "paddw %%xmm7,%%xmm3 \n" 2787 "pmullw %%xmm3,%%xmm2 \n" 2788 "movdqa (%1),%%xmm1 \n" 2789 "lea 0x10(%1),%1 \n" 2790 "psrlw $0x8,%%xmm1 \n" 2791 "por %%xmm4,%%xmm0 \n" 2792 "pmullw %%xmm3,%%xmm1 \n" 2793 "psrlw $0x8,%%xmm2 \n" 2794 "paddusb %%xmm2,%%xmm0 \n" 2795 "pand %%xmm5,%%xmm1 \n" 2796 "paddusb %%xmm1,%%xmm0 \n" 2797 "sub $0x4,%3 \n" 2798 "movdqa %%xmm0,(%2) \n" 2799 "lea 0x10(%2),%2 \n" 2800 "jge 40b \n" 2801 "jmp 49f \n" 2802 2803 // 4 pixel unaligned loop. 2804 ".p2align 2 \n" 2805 "41: \n" 2806 "movdqu (%0),%%xmm3 \n" 2807 "lea 0x10(%0),%0 \n" 2808 "movdqa %%xmm3,%%xmm0 \n" 2809 "pxor %%xmm4,%%xmm3 \n" 2810 "movdqu (%1),%%xmm2 \n" 2811 "pshufb %4,%%xmm3 \n" 2812 "pand %%xmm6,%%xmm2 \n" 2813 "paddw %%xmm7,%%xmm3 \n" 2814 "pmullw %%xmm3,%%xmm2 \n" 2815 "movdqu (%1),%%xmm1 \n" 2816 "lea 0x10(%1),%1 \n" 2817 "psrlw $0x8,%%xmm1 \n" 2818 "por %%xmm4,%%xmm0 \n" 2819 "pmullw %%xmm3,%%xmm1 \n" 2820 "psrlw $0x8,%%xmm2 \n" 2821 "paddusb %%xmm2,%%xmm0 \n" 2822 "pand %%xmm5,%%xmm1 \n" 2823 "paddusb %%xmm1,%%xmm0 \n" 2824 "sub $0x4,%3 \n" 2825 "movdqa %%xmm0,(%2) \n" 2826 "lea 0x10(%2),%2 \n" 2827 "jge 41b \n" 2828 2829 "49: \n" 2830 "add $0x3,%3 \n" 2831 "jl 99f \n" 2832 2833 // 1 pixel loop. 2834 "91: \n" 2835 "movd (%0),%%xmm3 \n" 2836 "lea 0x4(%0),%0 \n" 2837 "movdqa %%xmm3,%%xmm0 \n" 2838 "pxor %%xmm4,%%xmm3 \n" 2839 "movd (%1),%%xmm2 \n" 2840 "pshufb %4,%%xmm3 \n" 2841 "pand %%xmm6,%%xmm2 \n" 2842 "paddw %%xmm7,%%xmm3 \n" 2843 "pmullw %%xmm3,%%xmm2 \n" 2844 "movd (%1),%%xmm1 \n" 2845 "lea 0x4(%1),%1 \n" 2846 "psrlw $0x8,%%xmm1 \n" 2847 "por %%xmm4,%%xmm0 \n" 2848 "pmullw %%xmm3,%%xmm1 \n" 2849 "psrlw $0x8,%%xmm2 \n" 2850 "paddusb %%xmm2,%%xmm0 \n" 2851 "pand %%xmm5,%%xmm1 \n" 2852 "paddusb %%xmm1,%%xmm0 \n" 2853 "sub $0x1,%3 \n" 2854 "movd %%xmm0,(%2) \n" 2855 "lea 0x4(%2),%2 \n" 2856 "jge 91b \n" 2857 "99: \n" 2858 : "+r"(src_argb0), // %0 2859 "+r"(src_argb1), // %1 2860 "+r"(dst_argb), // %2 2861 "+r"(width) // %3 2862 : "m"(kShuffleAlpha) // %4 2863 : "memory", "cc" 2864#if defined(__SSE2__) 2865 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 2866#endif 2867 ); 2868} 2869#endif // HAS_ARGBBLENDROW_SSSE3 2870 2871#ifdef HAS_ARGBATTENUATE_SSE2 2872// Attenuate 4 pixels at a time. 2873// aligned to 16 bytes 2874void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 2875 asm volatile ( 2876 "sub %0,%1 \n" 2877 "pcmpeqb %%xmm4,%%xmm4 \n" 2878 "pslld $0x18,%%xmm4 \n" 2879 "pcmpeqb %%xmm5,%%xmm5 \n" 2880 "psrld $0x8,%%xmm5 \n" 2881 2882 // 4 pixel loop. 2883 ".p2align 4 \n" 2884 "1: \n" 2885 "movdqa (%0),%%xmm0 \n" 2886 "punpcklbw %%xmm0,%%xmm0 \n" 2887 "pshufhw $0xff,%%xmm0,%%xmm2 \n" 2888 "pshuflw $0xff,%%xmm2,%%xmm2 \n" 2889 "pmulhuw %%xmm2,%%xmm0 \n" 2890 "movdqa (%0),%%xmm1 \n" 2891 "punpckhbw %%xmm1,%%xmm1 \n" 2892 "pshufhw $0xff,%%xmm1,%%xmm2 \n" 2893 "pshuflw $0xff,%%xmm2,%%xmm2 \n" 2894 "pmulhuw %%xmm2,%%xmm1 \n" 2895 "movdqa (%0),%%xmm2 \n" 2896 "psrlw $0x8,%%xmm0 \n" 2897 "pand %%xmm4,%%xmm2 \n" 2898 "psrlw $0x8,%%xmm1 \n" 2899 "packuswb %%xmm1,%%xmm0 \n" 2900 "pand %%xmm5,%%xmm0 \n" 2901 "por %%xmm2,%%xmm0 \n" 2902 "sub $0x4,%2 \n" 2903 "movdqa %%xmm0,(%0,%1,1) \n" 2904 "lea 0x10(%0),%0 \n" 2905 "jg 1b \n" 2906 : "+r"(src_argb), // %0 2907 "+r"(dst_argb), // %1 2908 "+r"(width) // %2 2909 : 2910 : "memory", "cc" 2911#if defined(__SSE2__) 2912 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2913#endif 2914 ); 2915} 2916#endif // HAS_ARGBATTENUATE_SSE2 2917 2918#ifdef HAS_ARGBATTENUATEROW_SSSE3 2919// Shuffle table duplicating alpha 2920CONST uvec8 kShuffleAlpha0 = { 2921 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 2922}; 2923CONST uvec8 kShuffleAlpha1 = { 2924 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 2925 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 2926}; 2927// Attenuate 4 pixels at a time. 2928// aligned to 16 bytes 2929void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 2930 asm volatile ( 2931 "sub %0,%1 \n" 2932 "pcmpeqb %%xmm3,%%xmm3 \n" 2933 "pslld $0x18,%%xmm3 \n" 2934 "movdqa %3,%%xmm4 \n" 2935 "movdqa %4,%%xmm5 \n" 2936 2937 // 4 pixel loop. 2938 ".p2align 4 \n" 2939 "1: \n" 2940 "movdqa (%0),%%xmm0 \n" 2941 "pshufb %%xmm4,%%xmm0 \n" 2942 "movdqa (%0),%%xmm1 \n" 2943 "punpcklbw %%xmm1,%%xmm1 \n" 2944 "pmulhuw %%xmm1,%%xmm0 \n" 2945 "movdqa (%0),%%xmm1 \n" 2946 "pshufb %%xmm5,%%xmm1 \n" 2947 "movdqa (%0),%%xmm2 \n" 2948 "punpckhbw %%xmm2,%%xmm2 \n" 2949 "pmulhuw %%xmm2,%%xmm1 \n" 2950 "movdqa (%0),%%xmm2 \n" 2951 "pand %%xmm3,%%xmm2 \n" 2952 "psrlw $0x8,%%xmm0 \n" 2953 "psrlw $0x8,%%xmm1 \n" 2954 "packuswb %%xmm1,%%xmm0 \n" 2955 "por %%xmm2,%%xmm0 \n" 2956 "sub $0x4,%2 \n" 2957 "movdqa %%xmm0,(%0,%1,1) \n" 2958 "lea 0x10(%0),%0 \n" 2959 "jg 1b \n" 2960 : "+r"(src_argb), // %0 2961 "+r"(dst_argb), // %1 2962 "+r"(width) // %2 2963 : "m"(kShuffleAlpha0), // %3 2964 "m"(kShuffleAlpha1) // %4 2965 : "memory", "cc" 2966#if defined(__SSE2__) 2967 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2968#endif 2969 ); 2970} 2971#endif // HAS_ARGBATTENUATEROW_SSSE3 2972 2973#ifdef HAS_ARGBUNATTENUATEROW_SSE2 2974// Unattenuate 4 pixels at a time. 2975// aligned to 16 bytes 2976void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 2977 int width) { 2978 uintptr_t alpha = 0; 2979 asm volatile ( 2980 "sub %0,%1 \n" 2981 "pcmpeqb %%xmm4,%%xmm4 \n" 2982 "pslld $0x18,%%xmm4 \n" 2983 2984 // 4 pixel loop. 2985 ".p2align 4 \n" 2986 "1: \n" 2987 "movdqa (%0),%%xmm0 \n" 2988 "movzb 0x3(%0),%3 \n" 2989 "punpcklbw %%xmm0,%%xmm0 \n" 2990 "movd 0x0(%4,%3,4),%%xmm2 \n" 2991 "movzb 0x7(%0),%3 \n" 2992 "movd 0x0(%4,%3,4),%%xmm3 \n" 2993 "pshuflw $0xc0,%%xmm2,%%xmm2 \n" 2994 "pshuflw $0xc0,%%xmm3,%%xmm3 \n" 2995 "movlhps %%xmm3,%%xmm2 \n" 2996 "pmulhuw %%xmm2,%%xmm0 \n" 2997 "movdqa (%0),%%xmm1 \n" 2998 "movzb 0xb(%0),%3 \n" 2999 "punpckhbw %%xmm1,%%xmm1 \n" 3000 "movd 0x0(%4,%3,4),%%xmm2 \n" 3001 "movzb 0xf(%0),%3 \n" 3002 "movd 0x0(%4,%3,4),%%xmm3 \n" 3003 "pshuflw $0xc0,%%xmm2,%%xmm2 \n" 3004 "pshuflw $0xc0,%%xmm3,%%xmm3 \n" 3005 "movlhps %%xmm3,%%xmm2 \n" 3006 "pmulhuw %%xmm2,%%xmm1 \n" 3007 "movdqa (%0),%%xmm2 \n" 3008 "pand %%xmm4,%%xmm2 \n" 3009 "packuswb %%xmm1,%%xmm0 \n" 3010 "por %%xmm2,%%xmm0 \n" 3011 "sub $0x4,%2 \n" 3012 "movdqa %%xmm0,(%0,%1,1) \n" 3013 "lea 0x10(%0),%0 \n" 3014 "jg 1b \n" 3015 : "+r"(src_argb), // %0 3016 "+r"(dst_argb), // %1 3017 "+r"(width), // %2 3018 "+r"(alpha) // %3 3019 : "r"(fixed_invtbl8) // %4 3020 : "memory", "cc" 3021#if defined(__SSE2__) 3022 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 3023#endif 3024 ); 3025} 3026#endif // HAS_ARGBUNATTENUATEROW_SSE2 3027 3028#ifdef HAS_ARGBGRAYROW_SSSE3 3029// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R 3030CONST vec8 kARGBToGray = { 3031 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 3032}; 3033 3034// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels 3035void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 3036 asm volatile ( 3037 "movdqa %3,%%xmm4 \n" 3038 "sub %0,%1 \n" 3039 3040 // 8 pixel loop. 3041 ".p2align 4 \n" 3042 "1: \n" 3043 "movdqa (%0),%%xmm0 \n" 3044 "movdqa 0x10(%0),%%xmm1 \n" 3045 "pmaddubsw %%xmm4,%%xmm0 \n" 3046 "pmaddubsw %%xmm4,%%xmm1 \n" 3047 "phaddw %%xmm1,%%xmm0 \n" 3048 "psrlw $0x7,%%xmm0 \n" 3049 "packuswb %%xmm0,%%xmm0 \n" 3050 "movdqa (%0),%%xmm2 \n" 3051 "movdqa 0x10(%0),%%xmm3 \n" 3052 "psrld $0x18,%%xmm2 \n" 3053 "psrld $0x18,%%xmm3 \n" 3054 "packuswb %%xmm3,%%xmm2 \n" 3055 "packuswb %%xmm2,%%xmm2 \n" 3056 "movdqa %%xmm0,%%xmm3 \n" 3057 "punpcklbw %%xmm0,%%xmm0 \n" 3058 "punpcklbw %%xmm2,%%xmm3 \n" 3059 "movdqa %%xmm0,%%xmm1 \n" 3060 "punpcklwd %%xmm3,%%xmm0 \n" 3061 "punpckhwd %%xmm3,%%xmm1 \n" 3062 "sub $0x8,%2 \n" 3063 "movdqa %%xmm0,(%0,%1,1) \n" 3064 "movdqa %%xmm1,0x10(%0,%1,1) \n" 3065 "lea 0x20(%0),%0 \n" 3066 "jg 1b \n" 3067 : "+r"(src_argb), // %0 3068 "+r"(dst_argb), // %1 3069 "+r"(width) // %2 3070 : "m"(kARGBToGray) // %3 3071 : "memory", "cc" 3072#if defined(__SSE2__) 3073 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 3074#endif 3075 ); 3076} 3077#endif // HAS_ARGBGRAYROW_SSSE3 3078 3079#ifdef HAS_ARGBSEPIAROW_SSSE3 3080// b = (r * 35 + g * 68 + b * 17) >> 7 3081// g = (r * 45 + g * 88 + b * 22) >> 7 3082// r = (r * 50 + g * 98 + b * 24) >> 7 3083// Constant for ARGB color to sepia tone 3084CONST vec8 kARGBToSepiaB = { 3085 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 3086}; 3087 3088CONST vec8 kARGBToSepiaG = { 3089 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 3090}; 3091 3092CONST vec8 kARGBToSepiaR = { 3093 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 3094}; 3095 3096// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 3097void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 3098 asm volatile ( 3099 "movdqa %2,%%xmm2 \n" 3100 "movdqa %3,%%xmm3 \n" 3101 "movdqa %4,%%xmm4 \n" 3102 3103 // 8 pixel loop. 3104 ".p2align 4 \n" 3105 "1: \n" 3106 "movdqa (%0),%%xmm0 \n" 3107 "movdqa 0x10(%0),%%xmm6 \n" 3108 "pmaddubsw %%xmm2,%%xmm0 \n" 3109 "pmaddubsw %%xmm2,%%xmm6 \n" 3110 "phaddw %%xmm6,%%xmm0 \n" 3111 "psrlw $0x7,%%xmm0 \n" 3112 "packuswb %%xmm0,%%xmm0 \n" 3113 "movdqa (%0),%%xmm5 \n" 3114 "movdqa 0x10(%0),%%xmm1 \n" 3115 "pmaddubsw %%xmm3,%%xmm5 \n" 3116 "pmaddubsw %%xmm3,%%xmm1 \n" 3117 "phaddw %%xmm1,%%xmm5 \n" 3118 "psrlw $0x7,%%xmm5 \n" 3119 "packuswb %%xmm5,%%xmm5 \n" 3120 "punpcklbw %%xmm5,%%xmm0 \n" 3121 "movdqa (%0),%%xmm5 \n" 3122 "movdqa 0x10(%0),%%xmm1 \n" 3123 "pmaddubsw %%xmm4,%%xmm5 \n" 3124 "pmaddubsw %%xmm4,%%xmm1 \n" 3125 "phaddw %%xmm1,%%xmm5 \n" 3126 "psrlw $0x7,%%xmm5 \n" 3127 "packuswb %%xmm5,%%xmm5 \n" 3128 "movdqa (%0),%%xmm6 \n" 3129 "movdqa 0x10(%0),%%xmm1 \n" 3130 "psrld $0x18,%%xmm6 \n" 3131 "psrld $0x18,%%xmm1 \n" 3132 "packuswb %%xmm1,%%xmm6 \n" 3133 "packuswb %%xmm6,%%xmm6 \n" 3134 "punpcklbw %%xmm6,%%xmm5 \n" 3135 "movdqa %%xmm0,%%xmm1 \n" 3136 "punpcklwd %%xmm5,%%xmm0 \n" 3137 "punpckhwd %%xmm5,%%xmm1 \n" 3138 "sub $0x8,%1 \n" 3139 "movdqa %%xmm0,(%0) \n" 3140 "movdqa %%xmm1,0x10(%0) \n" 3141 "lea 0x20(%0),%0 \n" 3142 "jg 1b \n" 3143 : "+r"(dst_argb), // %0 3144 "+r"(width) // %1 3145 : "m"(kARGBToSepiaB), // %2 3146 "m"(kARGBToSepiaG), // %3 3147 "m"(kARGBToSepiaR) // %4 3148 : "memory", "cc" 3149#if defined(__SSE2__) 3150 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 3151#endif 3152 ); 3153} 3154#endif // HAS_ARGBSEPIAROW_SSSE3 3155 3156#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 3157// Tranform 8 ARGB pixels (32 bytes) with color matrix. 3158// Same as Sepia except matrix is provided. 3159void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, 3160 int width) { 3161 asm volatile ( 3162 "movd (%2),%%xmm2 \n" 3163 "movd 0x4(%2),%%xmm3 \n" 3164 "movd 0x8(%2),%%xmm4 \n" 3165 "pshufd $0x0,%%xmm2,%%xmm2 \n" 3166 "pshufd $0x0,%%xmm3,%%xmm3 \n" 3167 "pshufd $0x0,%%xmm4,%%xmm4 \n" 3168 3169 // 8 pixel loop. 3170 ".p2align 4 \n" 3171 "1: \n" 3172 "movdqa (%0),%%xmm0 \n" 3173 "movdqa 0x10(%0),%%xmm6 \n" 3174 "pmaddubsw %%xmm2,%%xmm0 \n" 3175 "pmaddubsw %%xmm2,%%xmm6 \n" 3176 "movdqa (%0),%%xmm5 \n" 3177 "movdqa 0x10(%0),%%xmm1 \n" 3178 "pmaddubsw %%xmm3,%%xmm5 \n" 3179 "pmaddubsw %%xmm3,%%xmm1 \n" 3180 "phaddsw %%xmm6,%%xmm0 \n" 3181 "phaddsw %%xmm1,%%xmm5 \n" 3182 "psraw $0x7,%%xmm0 \n" 3183 "psraw $0x7,%%xmm5 \n" 3184 "packuswb %%xmm0,%%xmm0 \n" 3185 "packuswb %%xmm5,%%xmm5 \n" 3186 "punpcklbw %%xmm5,%%xmm0 \n" 3187 "movdqa (%0),%%xmm5 \n" 3188 "movdqa 0x10(%0),%%xmm1 \n" 3189 "pmaddubsw %%xmm4,%%xmm5 \n" 3190 "pmaddubsw %%xmm4,%%xmm1 \n" 3191 "phaddsw %%xmm1,%%xmm5 \n" 3192 "psraw $0x7,%%xmm5 \n" 3193 "packuswb %%xmm5,%%xmm5 \n" 3194 "movdqa (%0),%%xmm6 \n" 3195 "movdqa 0x10(%0),%%xmm1 \n" 3196 "psrld $0x18,%%xmm6 \n" 3197 "psrld $0x18,%%xmm1 \n" 3198 "packuswb %%xmm1,%%xmm6 \n" 3199 "packuswb %%xmm6,%%xmm6 \n" 3200 "movdqa %%xmm0,%%xmm1 \n" 3201 "punpcklbw %%xmm6,%%xmm5 \n" 3202 "punpcklwd %%xmm5,%%xmm0 \n" 3203 "punpckhwd %%xmm5,%%xmm1 \n" 3204 "sub $0x8,%1 \n" 3205 "movdqa %%xmm0,(%0) \n" 3206 "movdqa %%xmm1,0x10(%0) \n" 3207 "lea 0x20(%0),%0 \n" 3208 "jg 1b \n" 3209 : "+r"(dst_argb), // %0 3210 "+r"(width) // %1 3211 : "r"(matrix_argb) // %2 3212 : "memory", "cc" 3213#if defined(__SSE2__) 3214 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 3215#endif 3216 ); 3217} 3218#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 3219 3220#ifdef HAS_ARGBQUANTIZEROW_SSE2 3221// Quantize 4 ARGB pixels (16 bytes). 3222// aligned to 16 bytes 3223void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 3224 int interval_offset, int width) { 3225 asm volatile ( 3226 "movd %2,%%xmm2 \n" 3227 "movd %3,%%xmm3 \n" 3228 "movd %4,%%xmm4 \n" 3229 "pshuflw $0x40,%%xmm2,%%xmm2 \n" 3230 "pshufd $0x44,%%xmm2,%%xmm2 \n" 3231 "pshuflw $0x40,%%xmm3,%%xmm3 \n" 3232 "pshufd $0x44,%%xmm3,%%xmm3 \n" 3233 "pshuflw $0x40,%%xmm4,%%xmm4 \n" 3234 "pshufd $0x44,%%xmm4,%%xmm4 \n" 3235 "pxor %%xmm5,%%xmm5 \n" 3236 "pcmpeqb %%xmm6,%%xmm6 \n" 3237 "pslld $0x18,%%xmm6 \n" 3238 3239 // 4 pixel loop. 3240 ".p2align 2 \n" 3241 "1: \n" 3242 "movdqa (%0),%%xmm0 \n" 3243 "punpcklbw %%xmm5,%%xmm0 \n" 3244 "pmulhuw %%xmm2,%%xmm0 \n" 3245 "movdqa (%0),%%xmm1 \n" 3246 "punpckhbw %%xmm5,%%xmm1 \n" 3247 "pmulhuw %%xmm2,%%xmm1 \n" 3248 "pmullw %%xmm3,%%xmm0 \n" 3249 "movdqa (%0),%%xmm7 \n" 3250 "pmullw %%xmm3,%%xmm1 \n" 3251 "pand %%xmm6,%%xmm7 \n" 3252 "paddw %%xmm4,%%xmm0 \n" 3253 "paddw %%xmm4,%%xmm1 \n" 3254 "packuswb %%xmm1,%%xmm0 \n" 3255 "por %%xmm7,%%xmm0 \n" 3256 "sub $0x4,%1 \n" 3257 "movdqa %%xmm0,(%0) \n" 3258 "lea 0x10(%0),%0 \n" 3259 "jg 1b \n" 3260 : "+r"(dst_argb), // %0 3261 "+r"(width) // %1 3262 : "r"(scale), // %2 3263 "r"(interval_size), // %3 3264 "r"(interval_offset) // %4 3265 : "memory", "cc" 3266#if defined(__SSE2__) 3267 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3268#endif 3269 ); 3270} 3271#endif // HAS_ARGBQUANTIZEROW_SSE2 3272 3273#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 3274// Creates a table of cumulative sums where each value is a sum of all values 3275// above and to the left of the value, inclusive of the value. 3276void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 3277 const int32* previous_cumsum, int width) { 3278 asm volatile ( 3279 "sub %1,%2 \n" 3280 "pxor %%xmm0,%%xmm0 \n" 3281 "pxor %%xmm1,%%xmm1 \n" 3282 "sub $0x4,%3 \n" 3283 "jl 49f \n" 3284 "test $0xf,%1 \n" 3285 "jne 49f \n" 3286 3287 // 4 pixel loop \n" 3288 ".p2align 2 \n" 3289 "40: \n" 3290 "movdqu (%0),%%xmm2 \n" 3291 "lea 0x10(%0),%0 \n" 3292 "movdqa %%xmm2,%%xmm4 \n" 3293 "punpcklbw %%xmm1,%%xmm2 \n" 3294 "movdqa %%xmm2,%%xmm3 \n" 3295 "punpcklwd %%xmm1,%%xmm2 \n" 3296 "punpckhwd %%xmm1,%%xmm3 \n" 3297 "punpckhbw %%xmm1,%%xmm4 \n" 3298 "movdqa %%xmm4,%%xmm5 \n" 3299 "punpcklwd %%xmm1,%%xmm4 \n" 3300 "punpckhwd %%xmm1,%%xmm5 \n" 3301 "paddd %%xmm2,%%xmm0 \n" 3302 "movdqa (%1,%2,1),%%xmm2 \n" 3303 "paddd %%xmm0,%%xmm2 \n" 3304 "paddd %%xmm3,%%xmm0 \n" 3305 "movdqa 0x10(%1,%2,1),%%xmm3 \n" 3306 "paddd %%xmm0,%%xmm3 \n" 3307 "paddd %%xmm4,%%xmm0 \n" 3308 "movdqa 0x20(%1,%2,1),%%xmm4 \n" 3309 "paddd %%xmm0,%%xmm4 \n" 3310 "paddd %%xmm5,%%xmm0 \n" 3311 "movdqa 0x30(%1,%2,1),%%xmm5 \n" 3312 "paddd %%xmm0,%%xmm5 \n" 3313 "movdqa %%xmm2,(%1) \n" 3314 "movdqa %%xmm3,0x10(%1) \n" 3315 "movdqa %%xmm4,0x20(%1) \n" 3316 "movdqa %%xmm5,0x30(%1) \n" 3317 "lea 0x40(%1),%1 \n" 3318 "sub $0x4,%3 \n" 3319 "jge 40b \n" 3320 3321 "49: \n" 3322 "add $0x3,%3 \n" 3323 "jl 19f \n" 3324 3325 // 1 pixel loop \n" 3326 ".p2align 2 \n" 3327 "10: \n" 3328 "movd (%0),%%xmm2 \n" 3329 "lea 0x4(%0),%0 \n" 3330 "punpcklbw %%xmm1,%%xmm2 \n" 3331 "punpcklwd %%xmm1,%%xmm2 \n" 3332 "paddd %%xmm2,%%xmm0 \n" 3333 "movdqu (%1,%2,1),%%xmm2 \n" 3334 "paddd %%xmm0,%%xmm2 \n" 3335 "movdqu %%xmm2,(%1) \n" 3336 "lea 0x10(%1),%1 \n" 3337 "sub $0x1,%3 \n" 3338 "jge 10b \n" 3339 3340 "19: \n" 3341 : "+r"(row), // %0 3342 "+r"(cumsum), // %1 3343 "+r"(previous_cumsum), // %2 3344 "+r"(width) // %3 3345 : 3346 : "memory", "cc" 3347#if defined(__SSE2__) 3348 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 3349#endif 3350 ); 3351} 3352#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 3353 3354#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 3355void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, 3356 int width, int area, uint8* dst, int count) { 3357 asm volatile ( 3358 "movd %5,%%xmm4 \n" 3359 "cvtdq2ps %%xmm4,%%xmm4 \n" 3360 "rcpss %%xmm4,%%xmm4 \n" 3361 "pshufd $0x0,%%xmm4,%%xmm4 \n" 3362 "sub $0x4,%3 \n" 3363 "jl 49f \n" 3364 3365 // 4 pixel loop \n" 3366 ".p2align 2 \n" 3367 "40: \n" 3368 "movdqa (%0),%%xmm0 \n" 3369 "movdqa 0x10(%0),%%xmm1 \n" 3370 "movdqa 0x20(%0),%%xmm2 \n" 3371 "movdqa 0x30(%0),%%xmm3 \n" 3372 "psubd (%0,%4,4),%%xmm0 \n" 3373 "psubd 0x10(%0,%4,4),%%xmm1 \n" 3374 "psubd 0x20(%0,%4,4),%%xmm2 \n" 3375 "psubd 0x30(%0,%4,4),%%xmm3 \n" 3376 "lea 0x40(%0),%0 \n" 3377 "psubd (%1),%%xmm0 \n" 3378 "psubd 0x10(%1),%%xmm1 \n" 3379 "psubd 0x20(%1),%%xmm2 \n" 3380 "psubd 0x30(%1),%%xmm3 \n" 3381 "paddd (%1,%4,4),%%xmm0 \n" 3382 "paddd 0x10(%1,%4,4),%%xmm1 \n" 3383 "paddd 0x20(%1,%4,4),%%xmm2 \n" 3384 "paddd 0x30(%1,%4,4),%%xmm3 \n" 3385 "lea 0x40(%1),%1 \n" 3386 "cvtdq2ps %%xmm0,%%xmm0 \n" 3387 "cvtdq2ps %%xmm1,%%xmm1 \n" 3388 "mulps %%xmm4,%%xmm0 \n" 3389 "mulps %%xmm4,%%xmm1 \n" 3390 "cvtdq2ps %%xmm2,%%xmm2 \n" 3391 "cvtdq2ps %%xmm3,%%xmm3 \n" 3392 "mulps %%xmm4,%%xmm2 \n" 3393 "mulps %%xmm4,%%xmm3 \n" 3394 "cvtps2dq %%xmm0,%%xmm0 \n" 3395 "cvtps2dq %%xmm1,%%xmm1 \n" 3396 "cvtps2dq %%xmm2,%%xmm2 \n" 3397 "cvtps2dq %%xmm3,%%xmm3 \n" 3398 "packssdw %%xmm1,%%xmm0 \n" 3399 "packssdw %%xmm3,%%xmm2 \n" 3400 "packuswb %%xmm2,%%xmm0 \n" 3401 "movdqu %%xmm0,(%2) \n" 3402 "lea 0x10(%2),%2 \n" 3403 "sub $0x4,%3 \n" 3404 "jge 40b \n" 3405 3406 "49: \n" 3407 "add $0x3,%3 \n" 3408 "jl 19f \n" 3409 3410 // 1 pixel loop \n" 3411 ".p2align 2 \n" 3412 "10: \n" 3413 "movdqa (%0),%%xmm0 \n" 3414 "psubd (%0,%4,4),%%xmm0 \n" 3415 "lea 0x10(%0),%0 \n" 3416 "psubd (%1),%%xmm0 \n" 3417 "paddd (%1,%4,4),%%xmm0 \n" 3418 "lea 0x10(%1),%1 \n" 3419 "cvtdq2ps %%xmm0,%%xmm0 \n" 3420 "mulps %%xmm4,%%xmm0 \n" 3421 "cvtps2dq %%xmm0,%%xmm0 \n" 3422 "packssdw %%xmm0,%%xmm0 \n" 3423 "packuswb %%xmm0,%%xmm0 \n" 3424 "movd %%xmm0,(%2) \n" 3425 "lea 0x4(%2),%2 \n" 3426 "sub $0x1,%3 \n" 3427 "jge 10b \n" 3428 "19: \n" 3429 : "+r"(topleft), // %0 3430 "+r"(botleft), // %1 3431 "+r"(dst), // %2 3432 "+rm"(count) // %3 3433 : "r"(static_cast<intptr_t>(width)), // %4 3434 "rm"(area) // %5 3435 : "memory", "cc" 3436#if defined(__SSE2__) 3437 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 3438#endif 3439 ); 3440} 3441#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 3442#ifdef HAS_ARGBSHADE_SSE2 3443// Shade 4 pixels at a time by specified value. 3444// Aligned to 16 bytes. 3445void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 3446 uint32 value) { 3447 asm volatile ( 3448 "movd %3,%%xmm2 \n" 3449 "sub %0,%1 \n" 3450 "punpcklbw %%xmm2,%%xmm2 \n" 3451 "punpcklqdq %%xmm2,%%xmm2 \n" 3452 3453 // 4 pixel loop. 3454 ".p2align 2 \n" 3455 "1: \n" 3456 "movdqa (%0),%%xmm0 \n" 3457 "movdqa %%xmm0,%%xmm1 \n" 3458 "punpcklbw %%xmm0,%%xmm0 \n" 3459 "punpckhbw %%xmm1,%%xmm1 \n" 3460 "pmulhuw %%xmm2,%%xmm0 \n" 3461 "pmulhuw %%xmm2,%%xmm1 \n" 3462 "psrlw $0x8,%%xmm0 \n" 3463 "psrlw $0x8,%%xmm1 \n" 3464 "packuswb %%xmm1,%%xmm0 \n" 3465 "sub $0x4,%2 \n" 3466 "movdqa %%xmm0,(%0,%1,1) \n" 3467 "lea 0x10(%0),%0 \n" 3468 "jg 1b \n" 3469 : "+r"(src_argb), // %0 3470 "+r"(dst_argb), // %1 3471 "+r"(width) // %2 3472 : "r"(value) // %3 3473 : "memory", "cc" 3474#if defined(__SSE2__) 3475 , "xmm0", "xmm1", "xmm2" 3476#endif 3477 ); 3478} 3479#endif // HAS_ARGBSHADE_SSE2 3480 3481#ifdef HAS_ARGBAFFINEROW_SSE2 3482// TODO(fbarchard): Find 64 bit way to avoid masking. 3483// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2. 3484// Copy ARGB pixels from source image with slope to a row of destination. 3485// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing 3486// an error if movq is used. movd %%xmm0,%1 3487 3488LIBYUV_API 3489void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 3490 uint8* dst_argb, const float* uv_dudv, int width) { 3491 intptr_t src_argb_stride_temp = src_argb_stride; 3492 intptr_t temp = 0; 3493 asm volatile ( 3494 "movq (%3),%%xmm2 \n" 3495 "movq 0x8(%3),%%xmm7 \n" 3496 "shl $0x10,%1 \n" 3497 "add $0x4,%1 \n" 3498 "movd %1,%%xmm5 \n" 3499 "sub $0x4,%4 \n" 3500 "jl 49f \n" 3501 3502 "pshufd $0x44,%%xmm7,%%xmm7 \n" 3503 "pshufd $0x0,%%xmm5,%%xmm5 \n" 3504 "movdqa %%xmm2,%%xmm0 \n" 3505 "addps %%xmm7,%%xmm0 \n" 3506 "movlhps %%xmm0,%%xmm2 \n" 3507 "movdqa %%xmm7,%%xmm4 \n" 3508 "addps %%xmm4,%%xmm4 \n" 3509 "movdqa %%xmm2,%%xmm3 \n" 3510 "addps %%xmm4,%%xmm3 \n" 3511 "addps %%xmm4,%%xmm4 \n" 3512 3513 // 4 pixel loop \n" 3514 ".p2align 4 \n" 3515 "40: \n" 3516 "cvttps2dq %%xmm2,%%xmm0 \n" 3517 "cvttps2dq %%xmm3,%%xmm1 \n" 3518 "packssdw %%xmm1,%%xmm0 \n" 3519 "pmaddwd %%xmm5,%%xmm0 \n" 3520#if defined(__x86_64__) 3521 "movd %%xmm0,%1 \n" 3522 "mov %1,%5 \n" 3523 "and $0x0fffffff,%1 \n" 3524 "shr $32,%5 \n" 3525 "pshufd $0xEE,%%xmm0,%%xmm0 \n" 3526#else 3527 "movd %%xmm0,%1 \n" 3528 "pshufd $0x39,%%xmm0,%%xmm0 \n" 3529 "movd %%xmm0,%5 \n" 3530 "pshufd $0x39,%%xmm0,%%xmm0 \n" 3531#endif 3532 "movd (%0,%1,1),%%xmm1 \n" 3533 "movd (%0,%5,1),%%xmm6 \n" 3534 "punpckldq %%xmm6,%%xmm1 \n" 3535 "addps %%xmm4,%%xmm2 \n" 3536 "movq %%xmm1,(%2) \n" 3537#if defined(__x86_64__) 3538 "movd %%xmm0,%1 \n" 3539 "mov %1,%5 \n" 3540 "and $0x0fffffff,%1 \n" 3541 "shr $32,%5 \n" 3542#else 3543 "movd %%xmm0,%1 \n" 3544 "pshufd $0x39,%%xmm0,%%xmm0 \n" 3545 "movd %%xmm0,%5 \n" 3546#endif 3547 "movd (%0,%1,1),%%xmm0 \n" 3548 "movd (%0,%5,1),%%xmm6 \n" 3549 "punpckldq %%xmm6,%%xmm0 \n" 3550 "addps %%xmm4,%%xmm3 \n" 3551 "sub $0x4,%4 \n" 3552 "movq %%xmm0,0x08(%2) \n" 3553 "lea 0x10(%2),%2 \n" 3554 "jge 40b \n" 3555 3556 "49: \n" 3557 "add $0x3,%4 \n" 3558 "jl 19f \n" 3559 3560 // 1 pixel loop \n" 3561 ".p2align 4 \n" 3562 "10: \n" 3563 "cvttps2dq %%xmm2,%%xmm0 \n" 3564 "packssdw %%xmm0,%%xmm0 \n" 3565 "pmaddwd %%xmm5,%%xmm0 \n" 3566 "addps %%xmm7,%%xmm2 \n" 3567 "movd %%xmm0,%1 \n" 3568#if defined(__x86_64__) 3569 "and $0x0fffffff,%1 \n" 3570#endif 3571 "movd (%0,%1,1),%%xmm0 \n" 3572 "sub $0x1,%4 \n" 3573 "movd %%xmm0,(%2) \n" 3574 "lea 0x4(%2),%2 \n" 3575 "jge 10b \n" 3576 "19: \n" 3577 : "+r"(src_argb), // %0 3578 "+r"(src_argb_stride_temp), // %1 3579 "+r"(dst_argb), // %2 3580 "+r"(uv_dudv), // %3 3581 "+rm"(width), // %4 3582 "+r"(temp) // %5 3583 : 3584 : "memory", "cc" 3585#if defined(__SSE2__) 3586 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 3587#endif 3588 ); 3589} 3590#endif // HAS_ARGBAFFINEROW_SSE2 3591 3592// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version 3593void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 3594 ptrdiff_t src_stride, int dst_width, 3595 int source_y_fraction) { 3596 asm volatile ( 3597 "sub %1,%0 \n" 3598 "shr %3 \n" 3599 "cmp $0x0,%3 \n" 3600 "je 2f \n" 3601 "cmp $0x40,%3 \n" 3602 "je 3f \n" 3603 "movd %3,%%xmm0 \n" 3604 "neg %3 \n" 3605 "add $0x80,%3 \n" 3606 "movd %3,%%xmm5 \n" 3607 "punpcklbw %%xmm0,%%xmm5 \n" 3608 "punpcklwd %%xmm5,%%xmm5 \n" 3609 "pshufd $0x0,%%xmm5,%%xmm5 \n" 3610 ".p2align 4 \n" 3611 "1: \n" 3612 "movdqa (%1),%%xmm0 \n" 3613 "movdqa (%1,%4,1),%%xmm2 \n" 3614 "movdqa %%xmm0,%%xmm1 \n" 3615 "punpcklbw %%xmm2,%%xmm0 \n" 3616 "punpckhbw %%xmm2,%%xmm1 \n" 3617 "pmaddubsw %%xmm5,%%xmm0 \n" 3618 "pmaddubsw %%xmm5,%%xmm1 \n" 3619 "psrlw $0x7,%%xmm0 \n" 3620 "psrlw $0x7,%%xmm1 \n" 3621 "packuswb %%xmm1,%%xmm0 \n" 3622 "sub $0x4,%2 \n" 3623 "movdqa %%xmm0,(%1,%0,1) \n" 3624 "lea 0x10(%1),%1 \n" 3625 "jg 1b \n" 3626 "jmp 4f \n" 3627 ".p2align 4 \n" 3628 "2: \n" 3629 "movdqa (%1),%%xmm0 \n" 3630 "sub $0x4,%2 \n" 3631 "movdqa %%xmm0,(%1,%0,1) \n" 3632 "lea 0x10(%1),%1 \n" 3633 "jg 2b \n" 3634 "jmp 4f \n" 3635 ".p2align 4 \n" 3636 "3: \n" 3637 "movdqa (%1),%%xmm0 \n" 3638 "pavgb (%1,%4,1),%%xmm0 \n" 3639 "sub $0x4,%2 \n" 3640 "movdqa %%xmm0,(%1,%0,1) \n" 3641 "lea 0x10(%1),%1 \n" 3642 "jg 3b \n" 3643 "4: \n" 3644 ".p2align 4 \n" 3645 : "+r"(dst_ptr), // %0 3646 "+r"(src_ptr), // %1 3647 "+r"(dst_width), // %2 3648 "+r"(source_y_fraction) // %3 3649 : "r"(static_cast<intptr_t>(src_stride)) // %4 3650 : "memory", "cc" 3651#if defined(__SSE2__) 3652 , "xmm0", "xmm1", "xmm2", "xmm5" 3653#endif 3654 ); 3655} 3656 3657#endif // defined(__x86_64__) || defined(__i386__) 3658 3659#ifdef __cplusplus 3660} // extern "C" 3661} // namespace libyuv 3662#endif 3663