/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
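// Note: pshufb writes source byte shuffle[i] to destination byte i, and a
// shuffle value with the high bit set (128) zeroes the destination byte.
// In the two masks above, bytes 0-11 of a 16-byte load hold four packed
// 3-byte pixels; indices 12-15 land in the alpha positions and are
// don't-cares, since the row functions OR in an 0xff000000 mask afterwards.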
// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW. First 8 + next 4
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

#if defined(TESTING) && defined(__x86_64__)
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align 5 \n"
    "mov %%eax,%%eax \n"
    "mov %%ebx,%%ebx \n"
    "mov %%ecx,%%ecx \n"
    "mov %%edx,%%edx \n"
    "mov %%esi,%%esi \n"
    "mov %%edi,%%edi \n"
    "mov %%ebp,%%ebp \n"
    "mov %%esp,%%esp \n"
    ".p2align 5 \n"
    "mov %%r8d,%%r8d \n"
    "mov %%r9d,%%r9d \n"
    "mov %%r10d,%%r10d \n"
    "mov %%r11d,%%r11d \n"
    "mov %%r12d,%%r12d \n"
    "mov %%r13d,%%r13d \n"
    "mov %%r14d,%%r14d \n"
    "mov %%r15d,%%r15d \n"
    ".p2align 5 \n"
    "lea (%%rax),%%eax \n"
    "lea (%%rbx),%%ebx \n"
    "lea (%%rcx),%%ecx \n"
    "lea (%%rdx),%%edx \n"
    "lea (%%rsi),%%esi \n"
    "lea (%%rdi),%%edi \n"
    "lea (%%rbp),%%ebp \n"
    "lea (%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea (%%r8),%%r8d \n"
    "lea (%%r9),%%r9d \n"
    "lea (%%r10),%%r10d \n"
    "lea (%%r11),%%r11d \n"
    "lea (%%r12),%%r12d \n"
    "lea (%%r13),%%r13d \n"
    "lea (%%r14),%%r14d \n"
    "lea (%%r15),%%r15d \n"

    ".p2align 5 \n"
    "lea 0x10(%%rax),%%eax \n"
    "lea 0x10(%%rbx),%%ebx \n"
    "lea 0x10(%%rcx),%%ecx \n"
    "lea 0x10(%%rdx),%%edx \n"
    "lea 0x10(%%rsi),%%esi \n"
    "lea 0x10(%%rdi),%%edi \n"
    "lea 0x10(%%rbp),%%ebp \n"
    "lea 0x10(%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea 0x10(%%r8),%%r8d \n"
    "lea 0x10(%%r9),%%r9d \n"
    "lea 0x10(%%r10),%%r10d \n"
    "lea 0x10(%%r11),%%r11d \n"
    "lea 0x10(%%r12),%%r12d \n"
    "lea 0x10(%%r13),%%r13d \n"
    "lea 0x10(%%r14),%%r14d \n"
    "lea 0x10(%%r15),%%r15d \n"

    ".p2align 5 \n"
    "add 0x10,%%eax \n"
    "add 0x10,%%ebx \n"
    "add 0x10,%%ecx \n"
    "add 0x10,%%edx \n"
    "add 0x10,%%esi \n"
    "add 0x10,%%edi \n"
    "add 0x10,%%ebp \n"
    "add 0x10,%%esp \n"
    ".p2align 5 \n"
    "add 0x10,%%r8d \n"
    "add 0x10,%%r9d \n"
    "add 0x10,%%r10d \n"
    "add 0x10,%%r11d \n"
    "add 0x10,%%r12d \n"
    "add 0x10,%%r13d \n"
    "add 0x10,%%r14d \n"
    "add 0x10,%%r15d \n"

    ".p2align 2 \n"
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // TESTING
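// I400 (greyscale) to ARGB: punpcklbw/punpcklwd replicate each Y byte into
// all four bytes of a pixel, then the 0xff000000 mask in xmm5 is OR-ed in
// for opaque alpha, so each output pixel is (0xff,Y,Y,Y).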
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // HAS_I400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(pix)         // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
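// RAWToARGBRow below is the same code as RGB24ToARGBRow with a different
// shuffle table: RAW stores R,G,B where RGB24 stores B,G,R, so
// kShuffleMaskRAWToARGB additionally swaps the first and third byte of
// each pixel.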
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)  // movdqa %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)  // movdqa %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)  // movdqa %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)  // movdqa %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
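// Note on the two conversions above: a 5-bit field f is widened to 8 bits
// as (f << 3) | (f >> 2). pmulhuw folds the shift and OR into one multiply:
// with the field in the top bits of a 16-bit lane, (f * 0x0108) >> 16 gives
// exactly that, and the other multipliers (0x2080, 0x4200) do the same for
// the 6-bit green channel and for fields at other bit positions.
// ARGB4444, handled next, is simpler still: each 4-bit channel becomes
// (x << 4) | x using the two nibble masks.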
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)  // movdqa %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)  // movdqa %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): Pass xmm constants to a single block of assembly.
// With fpic, GCC 4.2 for OSX runs out of GPR registers: "m" effectively
// ties up 3 registers (ebx, ebp and eax), so "m" operands can only be
// passed alongside 3 free registers, or 4 if the stack frame is disabled.
// Splitting into 2 assembly blocks is a workaround, though it is
// considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)  // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)  // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)  // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)  // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToUJ),  // %0
    "m"(kARGBToVJ),  // %1
    "m"(kAddUVJ128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)  // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)  // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)  // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)  // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
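// Notes on the converters above and below:
// - ARGBToYRow computes BT.601 studio-swing luma. kARGBToY holds
//   round(coeff * 128) in B,G,R,A memory order, so per pixel
//     Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16
//   with the +16 supplied by kAddY16. The YJ (JPEG full range) variants
//   use weights that sum to 128 and add kAddYJ64 before the shift for
//   rounding, instead of the +16 offset.
// - The UV converters average a 2x2 block (pavgb against the next row,
//   then shufps/pavgb to fold horizontal pairs) before applying
//     U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//     V = ((-18 * B - 94 * G + 112 * R) >> 8) + 128
//   where the +128 comes from kAddUV128, or, for UVJ, from kAddUVJ128
//   (0x8080 per 16-bit lane), which also folds in the rounding term.
// - The _Unaligned variants below load with movdqu, doing pavgb
//   register-to-register since pavgb with a memory operand requires
//   16-byte alignment. ARGBToUV444 (no subsampling) and ARGBToUV422
//   (horizontal subsampling only) follow them.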
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToUJ),  // %0
    "m"(kARGBToVJ),  // %1
    "m"(kAddUVJ128)  // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)  // movdqa %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}

void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
                                    uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)  // movdqu %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
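// The BGRA, ABGR and RGBA converters below are byte-order permutations of
// the ARGB versions: the instruction sequences are identical and only the
// channel weights in the kBGRATo*/kABGRTo*/kRGBATo* constants differ.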
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
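// BGRAToUVRow uses kBGRAToU = {0, -38, -74, 112}, which is kARGBToU =
// {112, -74, -38, 0} reversed to match BGRA's A,R,G,B byte order.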
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),   // %0
    "m"(kBGRAToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)  // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)  // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)  // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)  // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_bgra))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kBGRAToU),   // %0
    "m"(kBGRAToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_bgra))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),   // %0
    "m"(kABGRToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)  // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)  // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)  // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)  // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_abgr))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kABGRToU),   // %0
    "m"(kABGRToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)  // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)  // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_abgr))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
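// For RGBA the alpha byte leads, so the weights shift right by one:
// kRGBAToY = {0, 13, 65, 33} versus kARGBToY = {13, 65, 33, 0}.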
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kRGBAToU),   // %0
    "m"(kRGBAToV),   // %1
    "m"(kAddUV128)   // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)  // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)  // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)  // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)  // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgba0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_rgba))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)  // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)  // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)  // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgba0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_rgba))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
#endif // HAS_ARGBTOUVROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(127,(int)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* (int8)(1.164 * 64 + 0.5) */

struct {
  vec8 kUVToB;     // 0
  vec8 kUVToG;     // 16
  vec8 kUVToR;     // 32
  vec16 kUVBiasB;  // 48
  vec16 kUVBiasG;  // 64
  vec16 kUVBiasR;  // 80
  vec16 kYSub16;   // 96
  vec16 kYToRgb;   // 112
  vec8 kVUToB;     // 128
  vec8 kVUToG;     // 144
  vec8 kVUToR;     // 160
} static SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};
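
// Scalar model of the fixed point math in YUVTORGB below (an illustrative
// sketch only, not part of the build; Clamp() is a hypothetical helper that
// saturates to [0,255] the way packuswb does). pmaddubsw computes
// u*UB + v*VB per lane, the bias rows fold the -128 offset of U and V back
// in, and the final arithmetic shift drops the 6 fraction bits:
//
//   int y1 = (y - 16) * YG;                              // kYSub16, kYToRgb
//   uint8 b = Clamp((y1 + (u - 128) * UB) >> 6);
//   uint8 g = Clamp((y1 + (u - 128) * UG + (v - 128) * VG) >> 6);
//   uint8 r = Clamp((y1 + (v - 128) * VR) >> 6);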

// Read 8 UV from 444
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    BUNDLEALIGN \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    BUNDLEALIGN \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    BUNDLEALIGN \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"
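
// How the 422 upsample works, byte by byte (illustration only): movd loads
// u0..u3, the second movd loads v0..v3 through the v_buf-u_buf offset set up
// by the callers, punpcklbw interleaves to u0 v0 u1 v1 u2 v2 u3 v3, and
// punpcklwd doubles each UV pair to u0 v0 u0 v0 u1 v1 u1 v1 ... so 4 UV
// samples cover 8 pixels. READYUV411 adds punpckldq to quadruple 2 samples
// the same way; READNV12 starts from already interleaved UV bytes.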

// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
    "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
    "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
    "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
    "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
    "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
    "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

// Convert 8 pixels: 8 VU and 8 Y
#define YVUTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
    "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
    "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
    "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
    "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
    "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
    "punpcklbw %%xmm4,%%xmm3 \n" \
    "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
    "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
    "paddsw %%xmm3,%%xmm0 \n" \
    "paddsw %%xmm3,%%xmm1 \n" \
    "paddsw %%xmm3,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV444
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
#if defined(__i386__)
  asm volatile (
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
#endif

  asm volatile (
#if !defined(__i386__)
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
#endif
    "sub %[u_buf],%[v_buf] \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS([dst_rgb24]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) " \n"
    "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),          // %[y_buf]
    [u_buf]"+r"(u_buf),          // %[u_buf]
    [v_buf]"+r"(v_buf),          // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
    [width]"+rm"(width)          // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#if !defined(__i386__)
    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
#endif
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_raw,
                               int width) {
// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
#if defined(__i386__)
  asm volatile (
    "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
     [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
#endif

  asm volatile (
#if !defined(__i386__)
    "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
#endif
    "sub %[u_buf],%[v_buf] \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) " \n"
    "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),      // %[y_buf]
    [u_buf]"+r"(u_buf),      // %[u_buf]
    [v_buf]"+r"(v_buf),      // %[v_buf]
    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
    [width]"+rm"(width)      // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
#if !defined(__i386__)
    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
#endif
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf] 2403 [v_buf]"+r"(v_buf), // %[v_buf] 2404 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2405 [width]"+rm"(width) // %[width] 2406 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 2407 : "memory", "cc" 2408#if defined(__native_client__) && defined(__x86_64__) 2409 , "r14" 2410#endif 2411#if defined(__SSE2__) 2412 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2413#endif 2414 ); 2415} 2416 2417void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, 2418 const uint8* uv_buf, 2419 uint8* dst_argb, 2420 int width) { 2421 asm volatile ( 2422 "pcmpeqb %%xmm5,%%xmm5 \n" 2423 "pxor %%xmm4,%%xmm4 \n" 2424 LABELALIGN 2425 "1: \n" 2426 READNV12 2427 YUVTORGB 2428 "punpcklbw %%xmm1,%%xmm0 \n" 2429 "punpcklbw %%xmm5,%%xmm2 \n" 2430 "movdqa %%xmm0,%%xmm1 \n" 2431 "punpcklwd %%xmm2,%%xmm0 \n" 2432 "punpckhwd %%xmm2,%%xmm1 \n" 2433 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" 2434 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 2435 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 2436 "sub $0x8,%[width] \n" 2437 "jg 1b \n" 2438 : [y_buf]"+r"(y_buf), // %[y_buf] 2439 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2440 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2441 [width]"+rm"(width) // %[width] 2442 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 2443 : "memory", "cc" 2444 // Does not use r14. 2445#if defined(__SSE2__) 2446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2447#endif 2448 ); 2449} 2450 2451void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, 2452 const uint8* uv_buf, 2453 uint8* dst_argb, 2454 int width) { 2455 asm volatile ( 2456 "pcmpeqb %%xmm5,%%xmm5 \n" 2457 "pxor %%xmm4,%%xmm4 \n" 2458 LABELALIGN 2459 "1: \n" 2460 READNV12 2461 YVUTORGB 2462 "punpcklbw %%xmm1,%%xmm0 \n" 2463 "punpcklbw %%xmm5,%%xmm2 \n" 2464 "movdqa %%xmm0,%%xmm1 \n" 2465 "punpcklwd %%xmm2,%%xmm0 \n" 2466 "punpckhwd %%xmm2,%%xmm1 \n" 2467 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" 2468 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 2469 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 2470 "sub $0x8,%[width] \n" 2471 "jg 1b \n" 2472 : [y_buf]"+r"(y_buf), // %[y_buf] 2473 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2474 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2475 [width]"+rm"(width) // %[width] 2476 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 2477 : "memory", "cc" 2478 // Does not use r14. 
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV444
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
2597 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2598#endif 2599 ); 2600} 2601 2602void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 2603 const uint8* uv_buf, 2604 uint8* dst_argb, 2605 int width) { 2606 asm volatile ( 2607 "pcmpeqb %%xmm5,%%xmm5 \n" 2608 "pxor %%xmm4,%%xmm4 \n" 2609 LABELALIGN 2610 "1: \n" 2611 READNV12 2612 YUVTORGB 2613 "punpcklbw %%xmm1,%%xmm0 \n" 2614 "punpcklbw %%xmm5,%%xmm2 \n" 2615 "movdqa %%xmm0,%%xmm1 \n" 2616 "punpcklwd %%xmm2,%%xmm0 \n" 2617 "punpckhwd %%xmm2,%%xmm1 \n" 2618 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 2619 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 2620 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 2621 "sub $0x8,%[width] \n" 2622 "jg 1b \n" 2623 : [y_buf]"+r"(y_buf), // %[y_buf] 2624 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2625 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2626 [width]"+rm"(width) // %[width] 2627 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 2628 : "memory", "cc" 2629 // Does not use r14. 2630#if defined(__SSE2__) 2631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2632#endif 2633 ); 2634} 2635 2636void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 2637 const uint8* uv_buf, 2638 uint8* dst_argb, 2639 int width) { 2640 asm volatile ( 2641 "pcmpeqb %%xmm5,%%xmm5 \n" 2642 "pxor %%xmm4,%%xmm4 \n" 2643 LABELALIGN 2644 "1: \n" 2645 READNV12 2646 YVUTORGB 2647 "punpcklbw %%xmm1,%%xmm0 \n" 2648 "punpcklbw %%xmm5,%%xmm2 \n" 2649 "movdqa %%xmm0,%%xmm1 \n" 2650 "punpcklwd %%xmm2,%%xmm0 \n" 2651 "punpckhwd %%xmm2,%%xmm1 \n" 2652 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 2653 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 2654 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 2655 "sub $0x8,%[width] \n" 2656 "jg 1b \n" 2657 : [y_buf]"+r"(y_buf), // %[y_buf] 2658 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2659 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2660 [width]"+rm"(width) // %[width] 2661 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 2662 : "memory", "cc" 2663 // Does not use r14. 

void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5," MEMACCESS([dst_bgra]) " \n"
    "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n"
    "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2," MEMACCESS([dst_abgr]) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n"
    "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm2,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqa %%xmm5," MEMACCESS([dst_rgba]) " \n"
    "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n"
    "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_bgra,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n"
    "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n"
    "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_abgr,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm2 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n"
    "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_rgba,
                                          int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pxor %%xmm4,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm2,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "movdqa %%xmm5,%%xmm0 \n"
    "punpcklwd %%xmm1,%%xmm5 \n"
    "punpckhwd %%xmm1,%%xmm0 \n"
    "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n"
    "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n"
    "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)        // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)  // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

#endif // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "mov $0x00100010,%%eax \n"
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "mov $0x004a004a,%%eax \n"
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    LABELALIGN
    "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"
    "pmullw %%xmm2,%%xmm0 \n"
    "psrlw $6, %%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"

    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    // xmm5 added to the clobber list; it is zeroed by the pxor above.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_YTOARGBROW_SSE2
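
// Scalar model of YToARGBRow_SSE2 above (a sketch for reference only; Clamp()
// is a hypothetical helper saturating to [0,255] like packuswb):
//
//   uint8 g = Clamp(((y - 16) * 0x4a) >> 6);  // 0x4a = 74 = round(1.164*64)
//   uint32 argb = 0xff000000u | (g << 16) | (g << 8) | g;
//
// psubusw saturates at zero, so y values below 16 clamp to black.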

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa %3,%%xmm5 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    LABELALIGN
    "1: \n"
    MEMOPREG(movdqa,0x00,0,2,1,xmm0)  // movdqa (%0,%2),%%xmm0
    "pshufb %%xmm5,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    LABELALIGN
    "1: \n"
    MEMOPREG(movdqu,0x00,0,2,1,xmm0)  // movdqu (%0,%2),%%xmm0
    "movdqa %%xmm0,%%xmm1 \n"
    "psllw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufd $0x4e,%%xmm0,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src),        // %0
    "+r"(dst),        // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif // HAS_MIRRORROW_SSE2

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa %4,%%xmm1 \n"
    "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    "sub $8,%3 \n"
    "movlpd %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)  // movhpd %%xmm0,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src),        // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
    "movdqa %3,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src),        // %0
    "+r"(dst),        // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif // HAS_ARGBMIRRORROW_SSSE3

#ifdef HAS_SPLITUVROW_SSE2
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)  // movdqa %%xmm2,(%1,%2)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)  // movdqu %%xmm2,(%1,%2)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif // HAS_SPLITUVROW_SSE2

#ifdef HAS_MERGEUVROW_SSE2
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)  // movdqa (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  asm volatile (
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)  // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm2 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif // HAS_MERGEUVROW_SSE2

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(count)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr $0x2,%2 \n"
    "rep movsl " MEMMOVESTRING(0,1) " \n"
  : "+S"(src),       // %0
    "+D"(dst),       // %1
    "+c"(width_tmp)  // %2
  :
  : "memory", "cc"
  );
}
#endif // HAS_COPYROW_X86
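
// Note: CopyRow_X86 copies width/4 dwords (the shr $0x2 above), so callers
// must pass a width that is a multiple of 4; any trailing bytes would be
// silently dropped. CopyRow_ERMS below has no such restriction.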

#ifdef HAS_COPYROW_ERMS
// Unaligned; any width in bytes (multiple of 1), using ERMS rep movsb.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) " \n"
  : "+S"(src),       // %0
    "+D"(dst),       // %1
    "+c"(width_tmp)  // %2
  :
  : "memory", "cc"
  );
}
#endif // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm0,%%xmm0 \n"
    "pslld $0x18,%%xmm0 \n"
    "pcmpeqb %%xmm1,%%xmm1 \n"
    "psrld $0x8,%%xmm1 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa " MEMACCESS(1) ",%%xmm4 \n"
    "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
    "pand %%xmm0,%%xmm2 \n"
    "pand %%xmm0,%%xmm3 \n"
    "pand %%xmm1,%%xmm4 \n"
    "pand %%xmm1,%%xmm5 \n"
    "por %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm2," MEMACCESS(1) " \n"
    "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
    "vmovdqu %%ymm1," MEMACCESS(1) " \n"
    "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm0,%%xmm0 \n"
    "pslld $0x18,%%xmm0 \n"
    "pcmpeqb %%xmm1,%%xmm1 \n"
    "psrld $0x8,%%xmm1 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    // Stale lanes left in xmm3 here are masked off by the pand with the
    // 0xff000000 mask below; only the Y bytes survive into the alpha lane.
    "punpckhwd %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm2,%%xmm2 \n"
    "movdqa " MEMACCESS(1) ",%%xmm4 \n"
    "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
    "pand %%xmm0,%%xmm2 \n"
    "pand %%xmm0,%%xmm3 \n"
    "pand %%xmm1,%%xmm4 \n"
    "pand %%xmm1,%%xmm5 \n"
    "por %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqa %%xmm2," MEMACCESS(1) " \n"
    "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "vpslld $0x18,%%ymm1,%%ymm1 \n"
    "vpslld $0x18,%%ymm2,%%ymm2 \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
    "vmovdqu %%ymm1," MEMACCESS(1) " \n"
    "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr $0x2,%1 \n"
    "rep stosl " MEMSTORESTRING(eax,0) " \n"
  : "+D"(dst),       // %0
    "+c"(width_tmp)  // %1
  : "a"(v32)         // %2
  : "memory", "cc");
}

void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    size_t width_tmp = (size_t)(width);
    uint32* d = (uint32*)(dst);
    asm volatile (
      "rep stosl " MEMSTORESTRING(eax,0) " \n"
    : "+D"(d),         // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
    dst += dst_stride;
  }
}
#endif // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
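// YUY2 packs pixels as Y0 U Y1 V, so masking each 16 bit word with 0x00ff
// (the psrlw'd xmm5 mask) keeps the Y bytes, while shifting words right by 8
// keeps the chroma bytes. UYVY further below is the opposite packing
// (U Y0 V Y1), which is why its Y/UV extraction swaps the pand and psrlw
// roles relative to the YUY2 functions here.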
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)  // movdqa (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)  // movdqa 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)  // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)  // movdqu 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)  // movdqa (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)  // movdqa 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x10,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)  // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)  // movdqu 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_YUY2TOYROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time.
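// Per channel this computes (a sketch of the math, derived from the code):
//   dst = min(255, src0 + ((src1 * (256 - src0_alpha)) >> 8))
// with the destination alpha forced to 0xff (the por with xmm4). The pxor
// with the 0xff000000 mask yields 255 - a, and the paddw of 1 (xmm7) makes
// it 256 - a before the pmullw.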
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "sub $0x1,%3 \n"
    "je 91f \n"
    "jl 99f \n"

    // 1 pixel loop until destination pointer is aligned.
    "10: \n"
    "test $0xf,%2 \n"
    "je 19f \n"
    "movd " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd " MEMACCESS(1) ",%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "jge 10b \n"

    "19: \n"
    "add $1-4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "41: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 41b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 99f \n"

    // 1 pixel loop.
  "91: \n"
    "movd " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd " MEMACCESS(1) ",%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "jge 91b \n"
  "99: \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif // HAS_ARGBBLENDROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 4 pixels at a time.
// Same as SSE2, but replaces:
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with:
//    pshufb     xmm3, kShuffleAlpha // alpha

void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "sub $0x1,%3 \n"
    "je 91f \n"
    "jl 99f \n"

    // 1 pixel loop until destination pointer is aligned.
  "10: \n"
    "test $0xf,%2 \n"
    "je 19f \n"
    "movd " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "sub $0x1,%3 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "jge 10b \n"

  "19: \n"
    "add $1-4,%3 \n"
    "jl 49f \n"
    "test $0xf,%0 \n"
    "jne 41f \n"
    "test $0xf,%1 \n"
    "jne 41f \n"

    // 4 pixel loop.
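    // Both sources passed the alignment tests above, so aligned (movdqa)
    // loads are safe in this path.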
4108 LABELALIGN 4109 "40: \n" 4110 "movdqa " MEMACCESS(0) ",%%xmm3 \n" 4111 "lea " MEMLEA(0x10,0) ",%0 \n" 4112 "movdqa %%xmm3,%%xmm0 \n" 4113 "pxor %%xmm4,%%xmm3 \n" 4114 "movdqa " MEMACCESS(1) ",%%xmm2 \n" 4115 "pshufb %4,%%xmm3 \n" 4116 "pand %%xmm6,%%xmm2 \n" 4117 "paddw %%xmm7,%%xmm3 \n" 4118 "pmullw %%xmm3,%%xmm2 \n" 4119 "movdqa " MEMACCESS(1) ",%%xmm1 \n" 4120 "lea " MEMLEA(0x10,1) ",%1 \n" 4121 "psrlw $0x8,%%xmm1 \n" 4122 "por %%xmm4,%%xmm0 \n" 4123 "pmullw %%xmm3,%%xmm1 \n" 4124 "psrlw $0x8,%%xmm2 \n" 4125 "paddusb %%xmm2,%%xmm0 \n" 4126 "pand %%xmm5,%%xmm1 \n" 4127 "paddusb %%xmm1,%%xmm0 \n" 4128 "sub $0x4,%3 \n" 4129 "movdqa %%xmm0," MEMACCESS(2) " \n" 4130 "lea " MEMLEA(0x10,2) ",%2 \n" 4131 "jge 40b \n" 4132 "jmp 49f \n" 4133 4134 // 4 pixel unaligned loop. 4135 LABELALIGN 4136 "41: \n" 4137 "movdqu " MEMACCESS(0) ",%%xmm3 \n" 4138 "lea " MEMLEA(0x10,0) ",%0 \n" 4139 "movdqa %%xmm3,%%xmm0 \n" 4140 "pxor %%xmm4,%%xmm3 \n" 4141 "movdqu " MEMACCESS(1) ",%%xmm2 \n" 4142 "pshufb %4,%%xmm3 \n" 4143 "pand %%xmm6,%%xmm2 \n" 4144 "paddw %%xmm7,%%xmm3 \n" 4145 "pmullw %%xmm3,%%xmm2 \n" 4146 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4147 "lea " MEMLEA(0x10,1) ",%1 \n" 4148 "psrlw $0x8,%%xmm1 \n" 4149 "por %%xmm4,%%xmm0 \n" 4150 "pmullw %%xmm3,%%xmm1 \n" 4151 "psrlw $0x8,%%xmm2 \n" 4152 "paddusb %%xmm2,%%xmm0 \n" 4153 "pand %%xmm5,%%xmm1 \n" 4154 "paddusb %%xmm1,%%xmm0 \n" 4155 "sub $0x4,%3 \n" 4156 "movdqa %%xmm0," MEMACCESS(2) " \n" 4157 "lea " MEMLEA(0x10,2) ",%2 \n" 4158 "jge 41b \n" 4159 4160 "49: \n" 4161 "add $0x3,%3 \n" 4162 "jl 99f \n" 4163 4164 // 1 pixel loop. 4165 "91: \n" 4166 "movd " MEMACCESS(0) ",%%xmm3 \n" 4167 "lea " MEMLEA(0x4,0) ",%0 \n" 4168 "movdqa %%xmm3,%%xmm0 \n" 4169 "pxor %%xmm4,%%xmm3 \n" 4170 "movd " MEMACCESS(1) ",%%xmm2 \n" 4171 "pshufb %4,%%xmm3 \n" 4172 "pand %%xmm6,%%xmm2 \n" 4173 "paddw %%xmm7,%%xmm3 \n" 4174 "pmullw %%xmm3,%%xmm2 \n" 4175 "movd " MEMACCESS(1) ",%%xmm1 \n" 4176 "lea " MEMLEA(0x4,1) ",%1 \n" 4177 "psrlw $0x8,%%xmm1 \n" 4178 "por %%xmm4,%%xmm0 \n" 4179 "pmullw %%xmm3,%%xmm1 \n" 4180 "psrlw $0x8,%%xmm2 \n" 4181 "paddusb %%xmm2,%%xmm0 \n" 4182 "pand %%xmm5,%%xmm1 \n" 4183 "paddusb %%xmm1,%%xmm0 \n" 4184 "sub $0x1,%3 \n" 4185 "movd %%xmm0," MEMACCESS(2) " \n" 4186 "lea " MEMLEA(0x4,2) ",%2 \n" 4187 "jge 91b \n" 4188 "99: \n" 4189 : "+r"(src_argb0), // %0 4190 "+r"(src_argb1), // %1 4191 "+r"(dst_argb), // %2 4192 "+r"(width) // %3 4193 : "m"(kShuffleAlpha) // %4 4194 : "memory", "cc" 4195#if defined(__SSE2__) 4196 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4197#endif 4198 ); 4199} 4200#endif // HAS_ARGBBLENDROW_SSSE3 4201 4202#ifdef HAS_ARGBATTENUATEROW_SSE2 4203// Attenuate 4 pixels at a time. 4204// aligned to 16 bytes 4205void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 4206 asm volatile ( 4207 "pcmpeqb %%xmm4,%%xmm4 \n" 4208 "pslld $0x18,%%xmm4 \n" 4209 "pcmpeqb %%xmm5,%%xmm5 \n" 4210 "psrld $0x8,%%xmm5 \n" 4211 4212 // 4 pixel loop. 
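    // rgb = rgb * a / 255 (approximately): each byte is widened to the word
    // v * 257 (punpcklbw with itself), multiplied by the broadcast alpha
    // word with pmulhuw, then shifted down 8 bits.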
4213 LABELALIGN 4214 "1: \n" 4215 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4216 "punpcklbw %%xmm0,%%xmm0 \n" 4217 "pshufhw $0xff,%%xmm0,%%xmm2 \n" 4218 "pshuflw $0xff,%%xmm2,%%xmm2 \n" 4219 "pmulhuw %%xmm2,%%xmm0 \n" 4220 "movdqa " MEMACCESS(0) ",%%xmm1 \n" 4221 "punpckhbw %%xmm1,%%xmm1 \n" 4222 "pshufhw $0xff,%%xmm1,%%xmm2 \n" 4223 "pshuflw $0xff,%%xmm2,%%xmm2 \n" 4224 "pmulhuw %%xmm2,%%xmm1 \n" 4225 "movdqa " MEMACCESS(0) ",%%xmm2 \n" 4226 "lea " MEMLEA(0x10,0) ",%0 \n" 4227 "psrlw $0x8,%%xmm0 \n" 4228 "pand %%xmm4,%%xmm2 \n" 4229 "psrlw $0x8,%%xmm1 \n" 4230 "packuswb %%xmm1,%%xmm0 \n" 4231 "pand %%xmm5,%%xmm0 \n" 4232 "por %%xmm2,%%xmm0 \n" 4233 "sub $0x4,%2 \n" 4234 "movdqa %%xmm0," MEMACCESS(1) " \n" 4235 "lea " MEMLEA(0x10,1) ",%1 \n" 4236 "jg 1b \n" 4237 : "+r"(src_argb), // %0 4238 "+r"(dst_argb), // %1 4239 "+r"(width) // %2 4240 : 4241 : "memory", "cc" 4242#if defined(__SSE2__) 4243 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4244#endif 4245 ); 4246} 4247#endif // HAS_ARGBATTENUATEROW_SSE2 4248 4249#ifdef HAS_ARGBATTENUATEROW_SSSE3 4250// Shuffle table duplicating alpha 4251static uvec8 kShuffleAlpha0 = { 4252 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 4253}; 4254static uvec8 kShuffleAlpha1 = { 4255 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 4256 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 4257}; 4258// Attenuate 4 pixels at a time. 4259// aligned to 16 bytes 4260void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4261 asm volatile ( 4262 "pcmpeqb %%xmm3,%%xmm3 \n" 4263 "pslld $0x18,%%xmm3 \n" 4264 "movdqa %3,%%xmm4 \n" 4265 "movdqa %4,%%xmm5 \n" 4266 4267 // 4 pixel loop. 4268 LABELALIGN 4269 "1: \n" 4270 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4271 "pshufb %%xmm4,%%xmm0 \n" 4272 "movdqu " MEMACCESS(0) ",%%xmm1 \n" 4273 "punpcklbw %%xmm1,%%xmm1 \n" 4274 "pmulhuw %%xmm1,%%xmm0 \n" 4275 "movdqu " MEMACCESS(0) ",%%xmm1 \n" 4276 "pshufb %%xmm5,%%xmm1 \n" 4277 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 4278 "punpckhbw %%xmm2,%%xmm2 \n" 4279 "pmulhuw %%xmm2,%%xmm1 \n" 4280 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 4281 "lea " MEMLEA(0x10,0) ",%0 \n" 4282 "pand %%xmm3,%%xmm2 \n" 4283 "psrlw $0x8,%%xmm0 \n" 4284 "psrlw $0x8,%%xmm1 \n" 4285 "packuswb %%xmm1,%%xmm0 \n" 4286 "por %%xmm2,%%xmm0 \n" 4287 "sub $0x4,%2 \n" 4288 "movdqu %%xmm0," MEMACCESS(1) " \n" 4289 "lea " MEMLEA(0x10,1) ",%1 \n" 4290 "jg 1b \n" 4291 : "+r"(src_argb), // %0 4292 "+r"(dst_argb), // %1 4293 "+r"(width) // %2 4294 : "m"(kShuffleAlpha0), // %3 4295 "m"(kShuffleAlpha1) // %4 4296 : "memory", "cc" 4297#if defined(__SSE2__) 4298 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4299#endif 4300 ); 4301} 4302#endif // HAS_ARGBATTENUATEROW_SSSE3 4303 4304#ifdef HAS_ARGBUNATTENUATEROW_SSE2 4305// Unattenuate 4 pixels at a time. 4306// aligned to 16 bytes 4307void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 4308 int width) { 4309 uintptr_t alpha = 0; 4310 asm volatile ( 4311 // 4 pixel loop. 
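    // rgb = rgb * 255 / a: a fixed-point reciprocal of each pixel's alpha
    // is fetched from fixed_invtbl8 (indexed by the alpha byte) and applied
    // with pmulhuw.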
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width), // %2
    "+r"(alpha) // %3
  : "r"(fixed_invtbl8) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movdqa " MEMACCESS(0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrld $0x18,%%xmm2 \n"
    "psrld $0x18,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm3 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64) // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %2,%%xmm2 \n"
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"

    // 8 pixel loop.
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "phaddw %%xmm6,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movdqa " MEMACCESS(0) ",%%xmm5 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqa " MEMACCESS(0) ",%%xmm5 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "sub $0x8,%1 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  : "m"(kARGBToSepiaB), // %2
    "m"(kARGBToSepiaG), // %3
    "m"(kARGBToSepiaR) // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except the matrix is provided by the caller.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    "pshufd $0x00,%%xmm5,%%xmm2 \n"
    "pshufd $0x55,%%xmm5,%%xmm3 \n"
    "pshufd $0xaa,%%xmm5,%%xmm4 \n"
    "pshufd $0xff,%%xmm5,%%xmm5 \n"

    // 8 pixel loop.
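    // Each output channel is a signed dot product of the pixel's 4 bytes
    // with one column of matrix_argb (pmaddubsw + phaddsw), scaled by >> 6.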
4496 LABELALIGN 4497 "1: \n" 4498 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4499 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" 4500 "pmaddubsw %%xmm2,%%xmm0 \n" 4501 "pmaddubsw %%xmm2,%%xmm7 \n" 4502 "movdqa " MEMACCESS(0) ",%%xmm6 \n" 4503 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 4504 "pmaddubsw %%xmm3,%%xmm6 \n" 4505 "pmaddubsw %%xmm3,%%xmm1 \n" 4506 "phaddsw %%xmm7,%%xmm0 \n" 4507 "phaddsw %%xmm1,%%xmm6 \n" 4508 "psraw $0x6,%%xmm0 \n" 4509 "psraw $0x6,%%xmm6 \n" 4510 "packuswb %%xmm0,%%xmm0 \n" 4511 "packuswb %%xmm6,%%xmm6 \n" 4512 "punpcklbw %%xmm6,%%xmm0 \n" 4513 "movdqa " MEMACCESS(0) ",%%xmm1 \n" 4514 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" 4515 "pmaddubsw %%xmm4,%%xmm1 \n" 4516 "pmaddubsw %%xmm4,%%xmm7 \n" 4517 "phaddsw %%xmm7,%%xmm1 \n" 4518 "movdqa " MEMACCESS(0) ",%%xmm6 \n" 4519 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" 4520 "pmaddubsw %%xmm5,%%xmm6 \n" 4521 "pmaddubsw %%xmm5,%%xmm7 \n" 4522 "phaddsw %%xmm7,%%xmm6 \n" 4523 "psraw $0x6,%%xmm1 \n" 4524 "psraw $0x6,%%xmm6 \n" 4525 "packuswb %%xmm1,%%xmm1 \n" 4526 "packuswb %%xmm6,%%xmm6 \n" 4527 "punpcklbw %%xmm6,%%xmm1 \n" 4528 "movdqa %%xmm0,%%xmm6 \n" 4529 "punpcklwd %%xmm1,%%xmm0 \n" 4530 "punpckhwd %%xmm1,%%xmm6 \n" 4531 "sub $0x8,%2 \n" 4532 "movdqa %%xmm0," MEMACCESS(1) " \n" 4533 "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" 4534 "lea " MEMLEA(0x20,0) ",%0 \n" 4535 "lea " MEMLEA(0x20,1) ",%1 \n" 4536 "jg 1b \n" 4537 : "+r"(src_argb), // %0 4538 "+r"(dst_argb), // %1 4539 "+r"(width) // %2 4540 : "r"(matrix_argb) // %3 4541 : "memory", "cc" 4542#if defined(__SSE2__) 4543 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4544#endif 4545 ); 4546} 4547#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 4548 4549#ifdef HAS_ARGBQUANTIZEROW_SSE2 4550// Quantize 4 ARGB pixels (16 bytes). 4551// aligned to 16 bytes 4552void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 4553 int interval_offset, int width) { 4554 asm volatile ( 4555 "movd %2,%%xmm2 \n" 4556 "movd %3,%%xmm3 \n" 4557 "movd %4,%%xmm4 \n" 4558 "pshuflw $0x40,%%xmm2,%%xmm2 \n" 4559 "pshufd $0x44,%%xmm2,%%xmm2 \n" 4560 "pshuflw $0x40,%%xmm3,%%xmm3 \n" 4561 "pshufd $0x44,%%xmm3,%%xmm3 \n" 4562 "pshuflw $0x40,%%xmm4,%%xmm4 \n" 4563 "pshufd $0x44,%%xmm4,%%xmm4 \n" 4564 "pxor %%xmm5,%%xmm5 \n" 4565 "pcmpeqb %%xmm6,%%xmm6 \n" 4566 "pslld $0x18,%%xmm6 \n" 4567 4568 // 4 pixel loop. 4569 LABELALIGN 4570 "1: \n" 4571 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4572 "punpcklbw %%xmm5,%%xmm0 \n" 4573 "pmulhuw %%xmm2,%%xmm0 \n" 4574 "movdqa " MEMACCESS(0) ",%%xmm1 \n" 4575 "punpckhbw %%xmm5,%%xmm1 \n" 4576 "pmulhuw %%xmm2,%%xmm1 \n" 4577 "pmullw %%xmm3,%%xmm0 \n" 4578 "movdqa " MEMACCESS(0) ",%%xmm7 \n" 4579 "pmullw %%xmm3,%%xmm1 \n" 4580 "pand %%xmm6,%%xmm7 \n" 4581 "paddw %%xmm4,%%xmm0 \n" 4582 "paddw %%xmm4,%%xmm1 \n" 4583 "packuswb %%xmm1,%%xmm0 \n" 4584 "por %%xmm7,%%xmm0 \n" 4585 "sub $0x4,%1 \n" 4586 "movdqa %%xmm0," MEMACCESS(0) " \n" 4587 "lea " MEMLEA(0x10,0) ",%0 \n" 4588 "jg 1b \n" 4589 : "+r"(dst_argb), // %0 4590 "+r"(width) // %1 4591 : "r"(scale), // %2 4592 "r"(interval_size), // %3 4593 "r"(interval_offset) // %4 4594 : "memory", "cc" 4595#if defined(__SSE2__) 4596 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 4597#endif 4598 ); 4599} 4600#endif // HAS_ARGBQUANTIZEROW_SSE2 4601 4602#ifdef HAS_ARGBSHADEROW_SSE2 4603// Shade 4 pixels at a time by specified value. 4604// Aligned to 16 bytes. 
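// Each channel is scaled by the matching channel of 'value', roughly
// c = c * v / 255 (both operands byte-duplicated, then pmulhuw, then >> 8).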
4605void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 4606 uint32 value) { 4607 asm volatile ( 4608 "movd %3,%%xmm2 \n" 4609 "punpcklbw %%xmm2,%%xmm2 \n" 4610 "punpcklqdq %%xmm2,%%xmm2 \n" 4611 4612 // 4 pixel loop. 4613 LABELALIGN 4614 "1: \n" 4615 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4616 "lea " MEMLEA(0x10,0) ",%0 \n" 4617 "movdqa %%xmm0,%%xmm1 \n" 4618 "punpcklbw %%xmm0,%%xmm0 \n" 4619 "punpckhbw %%xmm1,%%xmm1 \n" 4620 "pmulhuw %%xmm2,%%xmm0 \n" 4621 "pmulhuw %%xmm2,%%xmm1 \n" 4622 "psrlw $0x8,%%xmm0 \n" 4623 "psrlw $0x8,%%xmm1 \n" 4624 "packuswb %%xmm1,%%xmm0 \n" 4625 "sub $0x4,%2 \n" 4626 "movdqa %%xmm0," MEMACCESS(1) " \n" 4627 "lea " MEMLEA(0x10,1) ",%1 \n" 4628 "jg 1b \n" 4629 : "+r"(src_argb), // %0 4630 "+r"(dst_argb), // %1 4631 "+r"(width) // %2 4632 : "r"(value) // %3 4633 : "memory", "cc" 4634#if defined(__SSE2__) 4635 , "xmm0", "xmm1", "xmm2" 4636#endif 4637 ); 4638} 4639#endif // HAS_ARGBSHADEROW_SSE2 4640 4641#ifdef HAS_ARGBMULTIPLYROW_SSE2 4642// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 4643void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4644 uint8* dst_argb, int width) { 4645 asm volatile ( 4646 "pxor %%xmm5,%%xmm5 \n" 4647 4648 // 4 pixel loop. 4649 LABELALIGN 4650 "1: \n" 4651 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4652 "lea " MEMLEA(0x10,0) ",%0 \n" 4653 "movdqu " MEMACCESS(1) ",%%xmm2 \n" 4654 "lea " MEMLEA(0x10,1) ",%1 \n" 4655 "movdqu %%xmm0,%%xmm1 \n" 4656 "movdqu %%xmm2,%%xmm3 \n" 4657 "punpcklbw %%xmm0,%%xmm0 \n" 4658 "punpckhbw %%xmm1,%%xmm1 \n" 4659 "punpcklbw %%xmm5,%%xmm2 \n" 4660 "punpckhbw %%xmm5,%%xmm3 \n" 4661 "pmulhuw %%xmm2,%%xmm0 \n" 4662 "pmulhuw %%xmm3,%%xmm1 \n" 4663 "packuswb %%xmm1,%%xmm0 \n" 4664 "sub $0x4,%3 \n" 4665 "movdqu %%xmm0," MEMACCESS(2) " \n" 4666 "lea " MEMLEA(0x10,2) ",%2 \n" 4667 "jg 1b \n" 4668 : "+r"(src_argb0), // %0 4669 "+r"(src_argb1), // %1 4670 "+r"(dst_argb), // %2 4671 "+r"(width) // %3 4672 : 4673 : "memory", "cc" 4674#if defined(__SSE2__) 4675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4676#endif 4677 ); 4678} 4679#endif // HAS_ARGBMULTIPLYROW_SSE2 4680 4681#ifdef HAS_ARGBADDROW_SSE2 4682// Add 2 rows of ARGB pixels together, 4 pixels at a time. 4683void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4684 uint8* dst_argb, int width) { 4685 asm volatile ( 4686 // 4 pixel loop. 4687 LABELALIGN 4688 "1: \n" 4689 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4690 "lea " MEMLEA(0x10,0) ",%0 \n" 4691 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4692 "lea " MEMLEA(0x10,1) ",%1 \n" 4693 "paddusb %%xmm1,%%xmm0 \n" 4694 "sub $0x4,%3 \n" 4695 "movdqu %%xmm0," MEMACCESS(2) " \n" 4696 "lea " MEMLEA(0x10,2) ",%2 \n" 4697 "jg 1b \n" 4698 : "+r"(src_argb0), // %0 4699 "+r"(src_argb1), // %1 4700 "+r"(dst_argb), // %2 4701 "+r"(width) // %3 4702 : 4703 : "memory", "cc" 4704#if defined(__SSE2__) 4705 , "xmm0", "xmm1" 4706#endif 4707 ); 4708} 4709#endif // HAS_ARGBADDROW_SSE2 4710 4711#ifdef HAS_ARGBSUBTRACTROW_SSE2 4712// Subtract 2 rows of ARGB pixels, 4 pixels at a time. 4713void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4714 uint8* dst_argb, int width) { 4715 asm volatile ( 4716 // 4 pixel loop. 
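    // Per-byte saturating subtract: dst = max(0, src_argb0 - src_argb1).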
4717 LABELALIGN 4718 "1: \n" 4719 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4720 "lea " MEMLEA(0x10,0) ",%0 \n" 4721 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4722 "lea " MEMLEA(0x10,1) ",%1 \n" 4723 "psubusb %%xmm1,%%xmm0 \n" 4724 "sub $0x4,%3 \n" 4725 "movdqu %%xmm0," MEMACCESS(2) " \n" 4726 "lea " MEMLEA(0x10,2) ",%2 \n" 4727 "jg 1b \n" 4728 : "+r"(src_argb0), // %0 4729 "+r"(src_argb1), // %1 4730 "+r"(dst_argb), // %2 4731 "+r"(width) // %3 4732 : 4733 : "memory", "cc" 4734#if defined(__SSE2__) 4735 , "xmm0", "xmm1" 4736#endif 4737 ); 4738} 4739#endif // HAS_ARGBSUBTRACTROW_SSE2 4740 4741#ifdef HAS_SOBELXROW_SSE2 4742// SobelX as a matrix is 4743// -1 0 1 4744// -2 0 2 4745// -1 0 1 4746void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4747 const uint8* src_y2, uint8* dst_sobelx, int width) { 4748 asm volatile ( 4749 "sub %0,%1 \n" 4750 "sub %0,%2 \n" 4751 "sub %0,%3 \n" 4752 "pxor %%xmm5,%%xmm5 \n" 4753 4754 // 8 pixel loop. 4755 LABELALIGN 4756 "1: \n" 4757 "movq " MEMACCESS(0) ",%%xmm0 \n" 4758 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" 4759 "punpcklbw %%xmm5,%%xmm0 \n" 4760 "punpcklbw %%xmm5,%%xmm1 \n" 4761 "psubw %%xmm1,%%xmm0 \n" 4762 BUNDLEALIGN 4763 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4764 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 4765 "punpcklbw %%xmm5,%%xmm1 \n" 4766 "punpcklbw %%xmm5,%%xmm2 \n" 4767 "psubw %%xmm2,%%xmm1 \n" 4768 BUNDLEALIGN 4769 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 4770 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 4771 "punpcklbw %%xmm5,%%xmm2 \n" 4772 "punpcklbw %%xmm5,%%xmm3 \n" 4773 "psubw %%xmm3,%%xmm2 \n" 4774 "paddw %%xmm2,%%xmm0 \n" 4775 "paddw %%xmm1,%%xmm0 \n" 4776 "paddw %%xmm1,%%xmm0 \n" 4777 "pxor %%xmm1,%%xmm1 \n" 4778 "psubw %%xmm0,%%xmm1 \n" 4779 "pmaxsw %%xmm1,%%xmm0 \n" 4780 "packuswb %%xmm0,%%xmm0 \n" 4781 "sub $0x8,%4 \n" 4782 BUNDLEALIGN 4783 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) 4784 "lea " MEMLEA(0x8,0) ",%0 \n" 4785 "jg 1b \n" 4786 : "+r"(src_y0), // %0 4787 "+r"(src_y1), // %1 4788 "+r"(src_y2), // %2 4789 "+r"(dst_sobelx), // %3 4790 "+r"(width) // %4 4791 : 4792 : "memory", "cc" 4793#if defined(__native_client__) && defined(__x86_64__) 4794 , "r14" 4795#endif 4796#if defined(__SSE2__) 4797 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4798#endif 4799 ); 4800} 4801#endif // HAS_SOBELXROW_SSE2 4802 4803#ifdef HAS_SOBELYROW_SSE2 4804// SobelY as a matrix is 4805// -1 -2 -1 4806// 0 0 0 4807// 1 2 1 4808void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4809 uint8* dst_sobely, int width) { 4810 asm volatile ( 4811 "sub %0,%1 \n" 4812 "sub %0,%2 \n" 4813 "pxor %%xmm5,%%xmm5 \n" 4814 4815 // 8 pixel loop. 
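    // Same structure as SobelX: accumulate the three taps, take the
    // absolute value as max(sum, -sum), then pack with unsigned saturation.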
4816 LABELALIGN 4817 "1: \n" 4818 "movq " MEMACCESS(0) ",%%xmm0 \n" 4819 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4820 "punpcklbw %%xmm5,%%xmm0 \n" 4821 "punpcklbw %%xmm5,%%xmm1 \n" 4822 "psubw %%xmm1,%%xmm0 \n" 4823 BUNDLEALIGN 4824 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" 4825 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 4826 "punpcklbw %%xmm5,%%xmm1 \n" 4827 "punpcklbw %%xmm5,%%xmm2 \n" 4828 "psubw %%xmm2,%%xmm1 \n" 4829 BUNDLEALIGN 4830 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" 4831 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 4832 "punpcklbw %%xmm5,%%xmm2 \n" 4833 "punpcklbw %%xmm5,%%xmm3 \n" 4834 "psubw %%xmm3,%%xmm2 \n" 4835 "paddw %%xmm2,%%xmm0 \n" 4836 "paddw %%xmm1,%%xmm0 \n" 4837 "paddw %%xmm1,%%xmm0 \n" 4838 "pxor %%xmm1,%%xmm1 \n" 4839 "psubw %%xmm0,%%xmm1 \n" 4840 "pmaxsw %%xmm1,%%xmm0 \n" 4841 "packuswb %%xmm0,%%xmm0 \n" 4842 "sub $0x8,%3 \n" 4843 BUNDLEALIGN 4844 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) 4845 "lea " MEMLEA(0x8,0) ",%0 \n" 4846 "jg 1b \n" 4847 : "+r"(src_y0), // %0 4848 "+r"(src_y1), // %1 4849 "+r"(dst_sobely), // %2 4850 "+r"(width) // %3 4851 : 4852 : "memory", "cc" 4853#if defined(__native_client__) && defined(__x86_64__) 4854 , "r14" 4855#endif 4856#if defined(__SSE2__) 4857 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4858#endif 4859 ); 4860} 4861#endif // HAS_SOBELYROW_SSE2 4862 4863#ifdef HAS_SOBELROW_SSE2 4864// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 4865// A = 255 4866// R = Sobel 4867// G = Sobel 4868// B = Sobel 4869void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4870 uint8* dst_argb, int width) { 4871 asm volatile ( 4872 "sub %0,%1 \n" 4873 "pcmpeqb %%xmm5,%%xmm5 \n" 4874 "pslld $0x18,%%xmm5 \n" 4875 4876 // 8 pixel loop. 4877 LABELALIGN 4878 "1: \n" 4879 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4880 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 4881 "lea " MEMLEA(0x10,0) ",%0 \n" 4882 "paddusb %%xmm1,%%xmm0 \n" 4883 "movdqa %%xmm0,%%xmm2 \n" 4884 "punpcklbw %%xmm0,%%xmm2 \n" 4885 "punpckhbw %%xmm0,%%xmm0 \n" 4886 "movdqa %%xmm2,%%xmm1 \n" 4887 "punpcklwd %%xmm2,%%xmm1 \n" 4888 "punpckhwd %%xmm2,%%xmm2 \n" 4889 "por %%xmm5,%%xmm1 \n" 4890 "por %%xmm5,%%xmm2 \n" 4891 "movdqa %%xmm0,%%xmm3 \n" 4892 "punpcklwd %%xmm0,%%xmm3 \n" 4893 "punpckhwd %%xmm0,%%xmm0 \n" 4894 "por %%xmm5,%%xmm3 \n" 4895 "por %%xmm5,%%xmm0 \n" 4896 "sub $0x10,%3 \n" 4897 "movdqa %%xmm1," MEMACCESS(2) " \n" 4898 "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" 4899 "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" 4900 "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" 4901 "lea " MEMLEA(0x40,2) ",%2 \n" 4902 "jg 1b \n" 4903 : "+r"(src_sobelx), // %0 4904 "+r"(src_sobely), // %1 4905 "+r"(dst_argb), // %2 4906 "+r"(width) // %3 4907 : 4908 : "memory", "cc" 4909#if defined(__native_client__) && defined(__x86_64__) 4910 , "r14" 4911#endif 4912#if defined(__SSE2__) 4913 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4914#endif 4915 ); 4916} 4917#endif // HAS_SOBELROW_SSE2 4918 4919#ifdef HAS_SOBELTOPLANEROW_SSE2 4920// Adds Sobel X and Sobel Y and stores Sobel into a plane. 4921void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4922 uint8* dst_y, int width) { 4923 asm volatile ( 4924 "sub %0,%1 \n" 4925 "pcmpeqb %%xmm5,%%xmm5 \n" 4926 "pslld $0x18,%%xmm5 \n" 4927 4928 // 8 pixel loop. 
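    // The plane value is just the saturating byte sum of the two Sobel
    // planes.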
4929 LABELALIGN 4930 "1: \n" 4931 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4932 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 4933 "lea " MEMLEA(0x10,0) ",%0 \n" 4934 "paddusb %%xmm1,%%xmm0 \n" 4935 "sub $0x10,%3 \n" 4936 "movdqa %%xmm0," MEMACCESS(2) " \n" 4937 "lea " MEMLEA(0x10,2) ",%2 \n" 4938 "jg 1b \n" 4939 : "+r"(src_sobelx), // %0 4940 "+r"(src_sobely), // %1 4941 "+r"(dst_y), // %2 4942 "+r"(width) // %3 4943 : 4944 : "memory", "cc" 4945#if defined(__native_client__) && defined(__x86_64__) 4946 , "r14" 4947#endif 4948#if defined(__SSE2__) 4949 , "xmm0", "xmm1" 4950#endif 4951 ); 4952} 4953#endif // HAS_SOBELTOPLANEROW_SSE2 4954 4955#ifdef HAS_SOBELXYROW_SSE2 4956// Mixes Sobel X, Sobel Y and Sobel into ARGB. 4957// A = 255 4958// R = Sobel X 4959// G = Sobel 4960// B = Sobel Y 4961void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 4962 uint8* dst_argb, int width) { 4963 asm volatile ( 4964 "sub %0,%1 \n" 4965 "pcmpeqb %%xmm5,%%xmm5 \n" 4966 4967 // 8 pixel loop. 4968 LABELALIGN 4969 "1: \n" 4970 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 4971 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 4972 "lea " MEMLEA(0x10,0) ",%0 \n" 4973 "movdqa %%xmm0,%%xmm2 \n" 4974 "paddusb %%xmm1,%%xmm2 \n" 4975 "movdqa %%xmm0,%%xmm3 \n" 4976 "punpcklbw %%xmm5,%%xmm3 \n" 4977 "punpckhbw %%xmm5,%%xmm0 \n" 4978 "movdqa %%xmm1,%%xmm4 \n" 4979 "punpcklbw %%xmm2,%%xmm4 \n" 4980 "punpckhbw %%xmm2,%%xmm1 \n" 4981 "movdqa %%xmm4,%%xmm6 \n" 4982 "punpcklwd %%xmm3,%%xmm6 \n" 4983 "punpckhwd %%xmm3,%%xmm4 \n" 4984 "movdqa %%xmm1,%%xmm7 \n" 4985 "punpcklwd %%xmm0,%%xmm7 \n" 4986 "punpckhwd %%xmm0,%%xmm1 \n" 4987 "sub $0x10,%3 \n" 4988 "movdqa %%xmm6," MEMACCESS(2) " \n" 4989 "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" 4990 "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" 4991 "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" 4992 "lea " MEMLEA(0x40,2) ",%2 \n" 4993 "jg 1b \n" 4994 : "+r"(src_sobelx), // %0 4995 "+r"(src_sobely), // %1 4996 "+r"(dst_argb), // %2 4997 "+r"(width) // %3 4998 : 4999 : "memory", "cc" 5000#if defined(__native_client__) && defined(__x86_64__) 5001 , "r14" 5002#endif 5003#if defined(__SSE2__) 5004 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 5005#endif 5006 ); 5007} 5008#endif // HAS_SOBELXYROW_SSE2 5009 5010#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 5011// Creates a table of cumulative sums where each value is a sum of all values 5012// above and to the left of the value, inclusive of the value. 
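// Equivalent scalar recurrence (an illustrative sketch; 'sum' carries the
// running per-channel total across the row):
//   for (x = 0; x < width; ++x) {
//     for (c = 0; c < 4; ++c) {
//       sum[c] += row[x * 4 + c];
//       cumsum[x * 4 + c] = previous_cumsum[x * 4 + c] + sum[c];
//     }
//   }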
5013void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 5014 const int32* previous_cumsum, int width) { 5015 asm volatile ( 5016 "pxor %%xmm0,%%xmm0 \n" 5017 "pxor %%xmm1,%%xmm1 \n" 5018 "sub $0x4,%3 \n" 5019 "jl 49f \n" 5020 "test $0xf,%1 \n" 5021 "jne 49f \n" 5022 5023 // 4 pixel loop \n" 5024 LABELALIGN 5025 "40: \n" 5026 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 5027 "lea " MEMLEA(0x10,0) ",%0 \n" 5028 "movdqa %%xmm2,%%xmm4 \n" 5029 "punpcklbw %%xmm1,%%xmm2 \n" 5030 "movdqa %%xmm2,%%xmm3 \n" 5031 "punpcklwd %%xmm1,%%xmm2 \n" 5032 "punpckhwd %%xmm1,%%xmm3 \n" 5033 "punpckhbw %%xmm1,%%xmm4 \n" 5034 "movdqa %%xmm4,%%xmm5 \n" 5035 "punpcklwd %%xmm1,%%xmm4 \n" 5036 "punpckhwd %%xmm1,%%xmm5 \n" 5037 "paddd %%xmm2,%%xmm0 \n" 5038 "movdqa " MEMACCESS(2) ",%%xmm2 \n" 5039 "paddd %%xmm0,%%xmm2 \n" 5040 "paddd %%xmm3,%%xmm0 \n" 5041 "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" 5042 "paddd %%xmm0,%%xmm3 \n" 5043 "paddd %%xmm4,%%xmm0 \n" 5044 "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" 5045 "paddd %%xmm0,%%xmm4 \n" 5046 "paddd %%xmm5,%%xmm0 \n" 5047 "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" 5048 "lea " MEMLEA(0x40,2) ",%2 \n" 5049 "paddd %%xmm0,%%xmm5 \n" 5050 "movdqa %%xmm2," MEMACCESS(1) " \n" 5051 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" 5052 "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" 5053 "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" 5054 "lea " MEMLEA(0x40,1) ",%1 \n" 5055 "sub $0x4,%3 \n" 5056 "jge 40b \n" 5057 5058 "49: \n" 5059 "add $0x3,%3 \n" 5060 "jl 19f \n" 5061 5062 // 1 pixel loop \n" 5063 LABELALIGN 5064 "10: \n" 5065 "movd " MEMACCESS(0) ",%%xmm2 \n" 5066 "lea " MEMLEA(0x4,0) ",%0 \n" 5067 "punpcklbw %%xmm1,%%xmm2 \n" 5068 "punpcklwd %%xmm1,%%xmm2 \n" 5069 "paddd %%xmm2,%%xmm0 \n" 5070 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 5071 "lea " MEMLEA(0x10,2) ",%2 \n" 5072 "paddd %%xmm0,%%xmm2 \n" 5073 "movdqu %%xmm2," MEMACCESS(1) " \n" 5074 "lea " MEMLEA(0x10,1) ",%1 \n" 5075 "sub $0x1,%3 \n" 5076 "jge 10b \n" 5077 5078 "19: \n" 5079 : "+r"(row), // %0 5080 "+r"(cumsum), // %1 5081 "+r"(previous_cumsum), // %2 5082 "+r"(width) // %3 5083 : 5084 : "memory", "cc" 5085#if defined(__SSE2__) 5086 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 5087#endif 5088 ); 5089} 5090#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 5091 5092#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5093void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 5094 int width, int area, uint8* dst, 5095 int count) { 5096 asm volatile ( 5097 "movd %5,%%xmm5 \n" 5098 "cvtdq2ps %%xmm5,%%xmm5 \n" 5099 "rcpss %%xmm5,%%xmm4 \n" 5100 "pshufd $0x0,%%xmm4,%%xmm4 \n" 5101 "sub $0x4,%3 \n" 5102 "jl 49f \n" 5103 "cmpl $0x80,%5 \n" 5104 "ja 40f \n" 5105 5106 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5107 "pcmpeqb %%xmm6,%%xmm6 \n" 5108 "psrld $0x10,%%xmm6 \n" 5109 "cvtdq2ps %%xmm6,%%xmm6 \n" 5110 "addps %%xmm6,%%xmm5 \n" 5111 "mulps %%xmm4,%%xmm5 \n" 5112 "cvtps2dq %%xmm5,%%xmm5 \n" 5113 "packssdw %%xmm5,%%xmm5 \n" 5114 5115 // 4 pixel small loop \n" 5116 LABELALIGN 5117 "4: \n" 5118 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5119 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5120 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 5121 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 5122 BUNDLEALIGN 5123 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 5124 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 5125 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 5126 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 5127 "lea " MEMLEA(0x40,0) ",%0 \n" 5128 "psubd " MEMACCESS(1) ",%%xmm0 \n" 5129 "psubd " 
MEMACCESS2(0x10,1) ",%%xmm1 \n" 5130 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 5131 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 5132 BUNDLEALIGN 5133 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 5134 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 5135 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 5136 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 5137 "lea " MEMLEA(0x40,1) ",%1 \n" 5138 "packssdw %%xmm1,%%xmm0 \n" 5139 "packssdw %%xmm3,%%xmm2 \n" 5140 "pmulhuw %%xmm5,%%xmm0 \n" 5141 "pmulhuw %%xmm5,%%xmm2 \n" 5142 "packuswb %%xmm2,%%xmm0 \n" 5143 "movdqu %%xmm0," MEMACCESS(2) " \n" 5144 "lea " MEMLEA(0x10,2) ",%2 \n" 5145 "sub $0x4,%3 \n" 5146 "jge 4b \n" 5147 "jmp 49f \n" 5148 5149 // 4 pixel loop \n" 5150 LABELALIGN 5151 "40: \n" 5152 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5153 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5154 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 5155 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 5156 BUNDLEALIGN 5157 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 5158 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 5159 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 5160 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 5161 "lea " MEMLEA(0x40,0) ",%0 \n" 5162 "psubd " MEMACCESS(1) ",%%xmm0 \n" 5163 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 5164 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 5165 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 5166 BUNDLEALIGN 5167 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 5168 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 5169 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 5170 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 5171 "lea " MEMLEA(0x40,1) ",%1 \n" 5172 "cvtdq2ps %%xmm0,%%xmm0 \n" 5173 "cvtdq2ps %%xmm1,%%xmm1 \n" 5174 "mulps %%xmm4,%%xmm0 \n" 5175 "mulps %%xmm4,%%xmm1 \n" 5176 "cvtdq2ps %%xmm2,%%xmm2 \n" 5177 "cvtdq2ps %%xmm3,%%xmm3 \n" 5178 "mulps %%xmm4,%%xmm2 \n" 5179 "mulps %%xmm4,%%xmm3 \n" 5180 "cvtps2dq %%xmm0,%%xmm0 \n" 5181 "cvtps2dq %%xmm1,%%xmm1 \n" 5182 "cvtps2dq %%xmm2,%%xmm2 \n" 5183 "cvtps2dq %%xmm3,%%xmm3 \n" 5184 "packssdw %%xmm1,%%xmm0 \n" 5185 "packssdw %%xmm3,%%xmm2 \n" 5186 "packuswb %%xmm2,%%xmm0 \n" 5187 "movdqu %%xmm0," MEMACCESS(2) " \n" 5188 "lea " MEMLEA(0x10,2) ",%2 \n" 5189 "sub $0x4,%3 \n" 5190 "jge 40b \n" 5191 5192 "49: \n" 5193 "add $0x3,%3 \n" 5194 "jl 19f \n" 5195 5196 // 1 pixel loop \n" 5197 LABELALIGN 5198 "10: \n" 5199 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5200 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 5201 "lea " MEMLEA(0x10,0) ",%0 \n" 5202 "psubd " MEMACCESS(1) ",%%xmm0 \n" 5203 BUNDLEALIGN 5204 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 5205 "lea " MEMLEA(0x10,1) ",%1 \n" 5206 "cvtdq2ps %%xmm0,%%xmm0 \n" 5207 "mulps %%xmm4,%%xmm0 \n" 5208 "cvtps2dq %%xmm0,%%xmm0 \n" 5209 "packssdw %%xmm0,%%xmm0 \n" 5210 "packuswb %%xmm0,%%xmm0 \n" 5211 "movd %%xmm0," MEMACCESS(2) " \n" 5212 "lea " MEMLEA(0x4,2) ",%2 \n" 5213 "sub $0x1,%3 \n" 5214 "jge 10b \n" 5215 "19: \n" 5216 : "+r"(topleft), // %0 5217 "+r"(botleft), // %1 5218 "+r"(dst), // %2 5219 "+rm"(count) // %3 5220 : "r"((intptr_t)(width)), // %4 5221 "rm"(area) // %5 5222 : "memory", "cc" 5223#if defined(__native_client__) && defined(__x86_64__) 5224 , "r14" 5225#endif 5226#if defined(__SSE2__) 5227 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 5228#endif 5229 ); 5230} 5231#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5232 
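// A plain C sketch of the integral-image box average above, kept out of the
// build for reference. Note that 'width' is applied as an int32 element
// offset, matching the (reg,width,4) addressing in the assembly; the name
// and structure here are illustrative, not the project's C fallback.
#if 0
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft,
                                             int width, int area,
                                             uint8* dst, int count) {
  float ooa = 1.0f / area;
  int i;
  for (i = 0; i < count; ++i) {
    // Box sum from the four corners of the integral image, per channel.
    dst[0] = (uint8)((botleft[width + 0] - botleft[0] -
                      topleft[width + 0] + topleft[0]) * ooa);
    dst[1] = (uint8)((botleft[width + 1] - botleft[1] -
                      topleft[width + 1] + topleft[1]) * ooa);
    dst[2] = (uint8)((botleft[width + 2] - botleft[2] -
                      topleft[width + 2] + topleft[2]) * ooa);
    dst[3] = (uint8)((botleft[width + 3] - botleft[3] -
                      topleft[width + 3] + topleft[3]) * ooa);
    dst += 4;
    topleft += 4;
    botleft += 4;
  }
}
#endif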
5233#ifdef HAS_ARGBAFFINEROW_SSE2 5234// Copy ARGB pixels from source image with slope to a row of destination. 5235LIBYUV_API 5236void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 5237 uint8* dst_argb, const float* src_dudv, int width) { 5238 intptr_t src_argb_stride_temp = src_argb_stride; 5239 intptr_t temp = 0; 5240 asm volatile ( 5241 "movq " MEMACCESS(3) ",%%xmm2 \n" 5242 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" 5243 "shl $0x10,%1 \n" 5244 "add $0x4,%1 \n" 5245 "movd %1,%%xmm5 \n" 5246 "sub $0x4,%4 \n" 5247 "jl 49f \n" 5248 5249 "pshufd $0x44,%%xmm7,%%xmm7 \n" 5250 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5251 "movdqa %%xmm2,%%xmm0 \n" 5252 "addps %%xmm7,%%xmm0 \n" 5253 "movlhps %%xmm0,%%xmm2 \n" 5254 "movdqa %%xmm7,%%xmm4 \n" 5255 "addps %%xmm4,%%xmm4 \n" 5256 "movdqa %%xmm2,%%xmm3 \n" 5257 "addps %%xmm4,%%xmm3 \n" 5258 "addps %%xmm4,%%xmm4 \n" 5259 5260 // 4 pixel loop \n" 5261 LABELALIGN 5262 "40: \n" 5263 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 5264 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 5265 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts 5266 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride 5267 "movd %%xmm0,%k1 \n" 5268 "pshufd $0x39,%%xmm0,%%xmm0 \n" 5269 "movd %%xmm0,%k5 \n" 5270 "pshufd $0x39,%%xmm0,%%xmm0 \n" 5271 BUNDLEALIGN 5272 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 5273 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 5274 "punpckldq %%xmm6,%%xmm1 \n" 5275 "addps %%xmm4,%%xmm2 \n" 5276 "movq %%xmm1," MEMACCESS(2) " \n" 5277 "movd %%xmm0,%k1 \n" 5278 "pshufd $0x39,%%xmm0,%%xmm0 \n" 5279 "movd %%xmm0,%k5 \n" 5280 BUNDLEALIGN 5281 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 5282 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 5283 "punpckldq %%xmm6,%%xmm0 \n" 5284 "addps %%xmm4,%%xmm3 \n" 5285 "sub $0x4,%4 \n" 5286 "movq %%xmm0," MEMACCESS2(0x08,2) " \n" 5287 "lea " MEMLEA(0x10,2) ",%2 \n" 5288 "jge 40b \n" 5289 5290 "49: \n" 5291 "add $0x3,%4 \n" 5292 "jl 19f \n" 5293 5294 // 1 pixel loop \n" 5295 LABELALIGN 5296 "10: \n" 5297 "cvttps2dq %%xmm2,%%xmm0 \n" 5298 "packssdw %%xmm0,%%xmm0 \n" 5299 "pmaddwd %%xmm5,%%xmm0 \n" 5300 "addps %%xmm7,%%xmm2 \n" 5301 "movd %%xmm0,%k1 \n" 5302 BUNDLEALIGN 5303 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 5304 "sub $0x1,%4 \n" 5305 "movd %%xmm0," MEMACCESS(2) " \n" 5306 "lea " MEMLEA(0x04,2) ",%2 \n" 5307 "jge 10b \n" 5308 "19: \n" 5309 : "+r"(src_argb), // %0 5310 "+r"(src_argb_stride_temp), // %1 5311 "+r"(dst_argb), // %2 5312 "+r"(src_dudv), // %3 5313 "+rm"(width), // %4 5314 "+r"(temp) // %5 5315 : 5316 : "memory", "cc" 5317#if defined(__native_client__) && defined(__x86_64__) 5318 , "r14" 5319#endif 5320#if defined(__SSE2__) 5321 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 5322#endif 5323 ); 5324} 5325#endif // HAS_ARGBAFFINEROW_SSE2 5326 5327#ifdef HAS_INTERPOLATEROW_SSSE3 5328// Bilinear filter 16x2 -> 16x1 5329void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 5330 ptrdiff_t src_stride, int dst_width, 5331 int source_y_fraction) { 5332 asm volatile ( 5333 "sub %1,%0 \n" 5334 "shr %3 \n" 5335 "cmp $0x0,%3 \n" 5336 "je 100f \n" 5337 "cmp $0x20,%3 \n" 5338 "je 75f \n" 5339 "cmp $0x40,%3 \n" 5340 "je 50f \n" 5341 "cmp $0x60,%3 \n" 5342 "je 25f \n" 5343 5344 "movd %3,%%xmm0 \n" 5345 "neg %3 \n" 5346 "add $0x80,%3 \n" 5347 "movd %3,%%xmm5 \n" 5348 "punpcklbw %%xmm0,%%xmm5 \n" 5349 "punpcklwd %%xmm5,%%xmm5 \n" 5350 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5351 5352 // General purpose row blend. 
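    // dst = (src * (128 - f) + src[stride] * f) >> 7, with f =
    // source_y_fraction / 2 (the shr above); the two weights are paired per
    // byte for pmaddubsw.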
5353 LABELALIGN 5354 "1: \n" 5355 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5356 MEMOPREG(movdqa,0x00,1,4,1,xmm2) 5357 "movdqa %%xmm0,%%xmm1 \n" 5358 "punpcklbw %%xmm2,%%xmm0 \n" 5359 "punpckhbw %%xmm2,%%xmm1 \n" 5360 "pmaddubsw %%xmm5,%%xmm0 \n" 5361 "pmaddubsw %%xmm5,%%xmm1 \n" 5362 "psrlw $0x7,%%xmm0 \n" 5363 "psrlw $0x7,%%xmm1 \n" 5364 "packuswb %%xmm1,%%xmm0 \n" 5365 "sub $0x10,%2 \n" 5366 BUNDLEALIGN 5367 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 5368 "lea " MEMLEA(0x10,1) ",%1 \n" 5369 "jg 1b \n" 5370 "jmp 99f \n" 5371 5372 // Blend 25 / 75. 5373 LABELALIGN 5374 "25: \n" 5375 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5376 MEMOPREG(movdqa,0x00,1,4,1,xmm1) 5377 "pavgb %%xmm1,%%xmm0 \n" 5378 "pavgb %%xmm1,%%xmm0 \n" 5379 "sub $0x10,%2 \n" 5380 BUNDLEALIGN 5381 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 5382 "lea " MEMLEA(0x10,1) ",%1 \n" 5383 "jg 25b \n" 5384 "jmp 99f \n" 5385 5386 // Blend 50 / 50. 5387 LABELALIGN 5388 "50: \n" 5389 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5390 MEMOPREG(movdqa,0x00,1,4,1,xmm1) 5391 "pavgb %%xmm1,%%xmm0 \n" 5392 "sub $0x10,%2 \n" 5393 BUNDLEALIGN 5394 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 5395 "lea " MEMLEA(0x10,1) ",%1 \n" 5396 "jg 50b \n" 5397 "jmp 99f \n" 5398 5399 // Blend 75 / 25. 5400 LABELALIGN 5401 "75: \n" 5402 "movdqa " MEMACCESS(1) ",%%xmm1 \n" 5403 MEMOPREG(movdqa,0x00,1,4,1,xmm0) 5404 "pavgb %%xmm1,%%xmm0 \n" 5405 "pavgb %%xmm1,%%xmm0 \n" 5406 "sub $0x10,%2 \n" 5407 BUNDLEALIGN 5408 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 5409 "lea " MEMLEA(0x10,1) ",%1 \n" 5410 "jg 75b \n" 5411 "jmp 99f \n" 5412 5413 // Blend 100 / 0 - Copy row unchanged. 5414 LABELALIGN 5415 "100: \n" 5416 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5417 "sub $0x10,%2 \n" 5418 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 5419 "lea " MEMLEA(0x10,1) ",%1 \n" 5420 "jg 100b \n" 5421 5422 "99: \n" 5423 : "+r"(dst_ptr), // %0 5424 "+r"(src_ptr), // %1 5425 "+r"(dst_width), // %2 5426 "+r"(source_y_fraction) // %3 5427 : "r"((intptr_t)(src_stride)) // %4 5428 : "memory", "cc" 5429#if defined(__native_client__) && defined(__x86_64__) 5430 , "r14" 5431#endif 5432#if defined(__SSE2__) 5433 , "xmm0", "xmm1", "xmm2", "xmm5" 5434#endif 5435 ); 5436} 5437#endif // HAS_INTERPOLATEROW_SSSE3 5438 5439#ifdef HAS_INTERPOLATEROW_SSE2 5440// Bilinear filter 16x2 -> 16x1 5441void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 5442 ptrdiff_t src_stride, int dst_width, 5443 int source_y_fraction) { 5444 asm volatile ( 5445 "sub %1,%0 \n" 5446 "shr %3 \n" 5447 "cmp $0x0,%3 \n" 5448 "je 100f \n" 5449 "cmp $0x20,%3 \n" 5450 "je 75f \n" 5451 "cmp $0x40,%3 \n" 5452 "je 50f \n" 5453 "cmp $0x60,%3 \n" 5454 "je 25f \n" 5455 5456 "movd %3,%%xmm0 \n" 5457 "neg %3 \n" 5458 "add $0x80,%3 \n" 5459 "movd %3,%%xmm5 \n" 5460 "punpcklbw %%xmm0,%%xmm5 \n" 5461 "punpcklwd %%xmm5,%%xmm5 \n" 5462 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5463 "pxor %%xmm4,%%xmm4 \n" 5464 5465 // General purpose row blend. 
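    // Without pmaddubsw: dst = src + ((other - src) * f) / 128 in signed
    // 16 bit lanes; the difference is doubled first so pmulhw keeps enough
    // precision.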
5466 LABELALIGN 5467 "1: \n" 5468 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5469 MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 5470 "movdqa %%xmm0,%%xmm1 \n" 5471 "movdqa %%xmm2,%%xmm3 \n" 5472 "punpcklbw %%xmm4,%%xmm2 \n" 5473 "punpckhbw %%xmm4,%%xmm3 \n" 5474 "punpcklbw %%xmm4,%%xmm0 \n" 5475 "punpckhbw %%xmm4,%%xmm1 \n" 5476 "psubw %%xmm0,%%xmm2 \n" 5477 "psubw %%xmm1,%%xmm3 \n" 5478 "paddw %%xmm2,%%xmm2 \n" 5479 "paddw %%xmm3,%%xmm3 \n" 5480 "pmulhw %%xmm5,%%xmm2 \n" 5481 "pmulhw %%xmm5,%%xmm3 \n" 5482 "paddw %%xmm2,%%xmm0 \n" 5483 "paddw %%xmm3,%%xmm1 \n" 5484 "packuswb %%xmm1,%%xmm0 \n" 5485 "sub $0x10,%2 \n" 5486 BUNDLEALIGN 5487 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 5488 "lea " MEMLEA(0x10,1) ",%1 \n" 5489 "jg 1b \n" 5490 "jmp 99f \n" 5491 5492 // Blend 25 / 75. 5493 LABELALIGN 5494 "25: \n" 5495 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5496 MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 5497 "pavgb %%xmm1,%%xmm0 \n" 5498 "pavgb %%xmm1,%%xmm0 \n" 5499 "sub $0x10,%2 \n" 5500 BUNDLEALIGN 5501 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 5502 "lea " MEMLEA(0x10,1) ",%1 \n" 5503 "jg 25b \n" 5504 "jmp 99f \n" 5505 5506 // Blend 50 / 50. 5507 LABELALIGN 5508 "50: \n" 5509 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5510 MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 5511 "pavgb %%xmm1,%%xmm0 \n" 5512 "sub $0x10,%2 \n" 5513 BUNDLEALIGN 5514 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 5515 "lea " MEMLEA(0x10,1) ",%1 \n" 5516 "jg 50b \n" 5517 "jmp 99f \n" 5518 5519 // Blend 75 / 25. 5520 LABELALIGN 5521 "75: \n" 5522 "movdqa " MEMACCESS(1) ",%%xmm1 \n" 5523 MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 5524 "pavgb %%xmm1,%%xmm0 \n" 5525 "pavgb %%xmm1,%%xmm0 \n" 5526 "sub $0x10,%2 \n" 5527 BUNDLEALIGN 5528 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 5529 "lea " MEMLEA(0x10,1) ",%1 \n" 5530 "jg 75b \n" 5531 "jmp 99f \n" 5532 5533 // Blend 100 / 0 - Copy row unchanged. 5534 LABELALIGN 5535 "100: \n" 5536 "movdqa " MEMACCESS(1) ",%%xmm0 \n" 5537 "sub $0x10,%2 \n" 5538 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 5539 "lea " MEMLEA(0x10,1) ",%1 \n" 5540 "jg 100b \n" 5541 5542 "99: \n" 5543 : "+r"(dst_ptr), // %0 5544 "+r"(src_ptr), // %1 5545 "+r"(dst_width), // %2 5546 "+r"(source_y_fraction) // %3 5547 : "r"((intptr_t)(src_stride)) // %4 5548 : "memory", "cc" 5549#if defined(__native_client__) && defined(__x86_64__) 5550 , "r14" 5551#endif 5552#if defined(__SSE2__) 5553 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 5554#endif 5555 ); 5556} 5557#endif // HAS_INTERPOLATEROW_SSE2 5558 5559#ifdef HAS_INTERPOLATEROW_SSSE3 5560// Bilinear filter 16x2 -> 16x1 5561void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 5562 ptrdiff_t src_stride, int dst_width, 5563 int source_y_fraction) { 5564 asm volatile ( 5565 "sub %1,%0 \n" 5566 "shr %3 \n" 5567 "cmp $0x0,%3 \n" 5568 "je 100f \n" 5569 "cmp $0x20,%3 \n" 5570 "je 75f \n" 5571 "cmp $0x40,%3 \n" 5572 "je 50f \n" 5573 "cmp $0x60,%3 \n" 5574 "je 25f \n" 5575 5576 "movd %3,%%xmm0 \n" 5577 "neg %3 \n" 5578 "add $0x80,%3 \n" 5579 "movd %3,%%xmm5 \n" 5580 "punpcklbw %%xmm0,%%xmm5 \n" 5581 "punpcklwd %%xmm5,%%xmm5 \n" 5582 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5583 5584 // General purpose row blend. 
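    // Same math as the aligned InterpolateRow_SSSE3; only the loads and
    // stores differ (movdqu instead of movdqa).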
5585 LABELALIGN 5586 "1: \n" 5587 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5588 MEMOPREG(movdqu,0x00,1,4,1,xmm2) 5589 "movdqu %%xmm0,%%xmm1 \n" 5590 "punpcklbw %%xmm2,%%xmm0 \n" 5591 "punpckhbw %%xmm2,%%xmm1 \n" 5592 "pmaddubsw %%xmm5,%%xmm0 \n" 5593 "pmaddubsw %%xmm5,%%xmm1 \n" 5594 "psrlw $0x7,%%xmm0 \n" 5595 "psrlw $0x7,%%xmm1 \n" 5596 "packuswb %%xmm1,%%xmm0 \n" 5597 "sub $0x10,%2 \n" 5598 BUNDLEALIGN 5599 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 5600 "lea " MEMLEA(0x10,1) ",%1 \n" 5601 "jg 1b \n" 5602 "jmp 99f \n" 5603 5604 // Blend 25 / 75. 5605 LABELALIGN 5606 "25: \n" 5607 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5608 MEMOPREG(movdqu,0x00,1,4,1,xmm1) 5609 "pavgb %%xmm1,%%xmm0 \n" 5610 "pavgb %%xmm1,%%xmm0 \n" 5611 "sub $0x10,%2 \n" 5612 BUNDLEALIGN 5613 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 5614 "lea " MEMLEA(0x10,1) ",%1 \n" 5615 "jg 25b \n" 5616 "jmp 99f \n" 5617 5618 // Blend 50 / 50. 5619 LABELALIGN 5620 "50: \n" 5621 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5622 MEMOPREG(movdqu,0x00,1,4,1,xmm1) 5623 "pavgb %%xmm1,%%xmm0 \n" 5624 "sub $0x10,%2 \n" 5625 BUNDLEALIGN 5626 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 5627 "lea " MEMLEA(0x10,1) ",%1 \n" 5628 "jg 50b \n" 5629 "jmp 99f \n" 5630 5631 // Blend 75 / 25. 5632 LABELALIGN 5633 "75: \n" 5634 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 5635 MEMOPREG(movdqu,0x00,1,4,1,xmm0) 5636 "pavgb %%xmm1,%%xmm0 \n" 5637 "pavgb %%xmm1,%%xmm0 \n" 5638 "sub $0x10,%2 \n" 5639 BUNDLEALIGN 5640 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 5641 "lea " MEMLEA(0x10,1) ",%1 \n" 5642 "jg 75b \n" 5643 "jmp 99f \n" 5644 5645 // Blend 100 / 0 - Copy row unchanged. 5646 LABELALIGN 5647 "100: \n" 5648 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5649 "sub $0x10,%2 \n" 5650 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 5651 "lea " MEMLEA(0x10,1) ",%1 \n" 5652 "jg 100b \n" 5653 5654 "99: \n" 5655 : "+r"(dst_ptr), // %0 5656 "+r"(src_ptr), // %1 5657 "+r"(dst_width), // %2 5658 "+r"(source_y_fraction) // %3 5659 : "r"((intptr_t)(src_stride)) // %4 5660 : "memory", "cc" 5661#if defined(__native_client__) && defined(__x86_64__) 5662 , "r14" 5663#endif 5664#if defined(__SSE2__) 5665 , "xmm0", "xmm1", "xmm2", "xmm5" 5666#endif 5667 ); 5668} 5669#endif // HAS_INTERPOLATEROW_SSSE3 5670 5671#ifdef HAS_INTERPOLATEROW_SSE2 5672// Bilinear filter 16x2 -> 16x1 5673void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, 5674 ptrdiff_t src_stride, int dst_width, 5675 int source_y_fraction) { 5676 asm volatile ( 5677 "sub %1,%0 \n" 5678 "shr %3 \n" 5679 "cmp $0x0,%3 \n" 5680 "je 100f \n" 5681 "cmp $0x20,%3 \n" 5682 "je 75f \n" 5683 "cmp $0x40,%3 \n" 5684 "je 50f \n" 5685 "cmp $0x60,%3 \n" 5686 "je 25f \n" 5687 5688 "movd %3,%%xmm0 \n" 5689 "neg %3 \n" 5690 "add $0x80,%3 \n" 5691 "movd %3,%%xmm5 \n" 5692 "punpcklbw %%xmm0,%%xmm5 \n" 5693 "punpcklwd %%xmm5,%%xmm5 \n" 5694 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5695 "pxor %%xmm4,%%xmm4 \n" 5696 5697 // General purpose row blend. 
5698 LABELALIGN 5699 "1: \n" 5700 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5701 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 5702 "movdqu %%xmm0,%%xmm1 \n" 5703 "movdqu %%xmm2,%%xmm3 \n" 5704 "punpcklbw %%xmm4,%%xmm2 \n" 5705 "punpckhbw %%xmm4,%%xmm3 \n" 5706 "punpcklbw %%xmm4,%%xmm0 \n" 5707 "punpckhbw %%xmm4,%%xmm1 \n" 5708 "psubw %%xmm0,%%xmm2 \n" 5709 "psubw %%xmm1,%%xmm3 \n" 5710 "paddw %%xmm2,%%xmm2 \n" 5711 "paddw %%xmm3,%%xmm3 \n" 5712 "pmulhw %%xmm5,%%xmm2 \n" 5713 "pmulhw %%xmm5,%%xmm3 \n" 5714 "paddw %%xmm2,%%xmm0 \n" 5715 "paddw %%xmm3,%%xmm1 \n" 5716 "packuswb %%xmm1,%%xmm0 \n" 5717 "sub $0x10,%2 \n" 5718 BUNDLEALIGN 5719 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 5720 "lea " MEMLEA(0x10,1) ",%1 \n" 5721 "jg 1b \n" 5722 "jmp 99f \n" 5723 5724 // Blend 25 / 75. 5725 LABELALIGN 5726 "25: \n" 5727 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5728 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 5729 "pavgb %%xmm1,%%xmm0 \n" 5730 "pavgb %%xmm1,%%xmm0 \n" 5731 "sub $0x10,%2 \n" 5732 BUNDLEALIGN 5733 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 5734 "lea " MEMLEA(0x10,1) ",%1 \n" 5735 "jg 25b \n" 5736 "jmp 99f \n" 5737 5738 // Blend 50 / 50. 5739 LABELALIGN 5740 "50: \n" 5741 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5742 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 5743 "pavgb %%xmm1,%%xmm0 \n" 5744 "sub $0x10,%2 \n" 5745 BUNDLEALIGN 5746 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 5747 "lea " MEMLEA(0x10,1) ",%1 \n" 5748 "jg 50b \n" 5749 "jmp 99f \n" 5750 5751 // Blend 75 / 25. 5752 LABELALIGN 5753 "75: \n" 5754 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 5755 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 5756 "pavgb %%xmm1,%%xmm0 \n" 5757 "pavgb %%xmm1,%%xmm0 \n" 5758 "sub $0x10,%2 \n" 5759 BUNDLEALIGN 5760 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 5761 "lea " MEMLEA(0x10,1) ",%1 \n" 5762 "jg 75b \n" 5763 "jmp 99f \n" 5764 5765 // Blend 100 / 0 - Copy row unchanged. 
5766 LABELALIGN 5767 "100: \n" 5768 "movdqu " MEMACCESS(1) ",%%xmm0 \n" 5769 "sub $0x10,%2 \n" 5770 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 5771 "lea " MEMLEA(0x10,1) ",%1 \n" 5772 "jg 100b \n" 5773 5774 "99: \n" 5775 : "+r"(dst_ptr), // %0 5776 "+r"(src_ptr), // %1 5777 "+r"(dst_width), // %2 5778 "+r"(source_y_fraction) // %3 5779 : "r"((intptr_t)(src_stride)) // %4 5780 : "memory", "cc" 5781#if defined(__native_client__) && defined(__x86_64__) 5782 , "r14" 5783#endif 5784#if defined(__SSE2__) 5785 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 5786#endif 5787 ); 5788} 5789#endif // HAS_INTERPOLATEROW_SSE2 5790 5791#ifdef HAS_HALFROW_SSE2 5792void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, 5793 uint8* dst_uv, int pix) { 5794 asm volatile ( 5795 "sub %0,%1 \n" 5796 LABELALIGN 5797 "1: \n" 5798 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5799 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 5800 "sub $0x10,%2 \n" 5801 MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) 5802 "lea " MEMLEA(0x10,0) ",%0 \n" 5803 "jg 1b \n" 5804 : "+r"(src_uv), // %0 5805 "+r"(dst_uv), // %1 5806 "+r"(pix) // %2 5807 : "r"((intptr_t)(src_uv_stride)) // %3 5808 : "memory", "cc" 5809#if defined(__SSE2__) 5810 , "xmm0" 5811#endif 5812 ); 5813} 5814#endif // HAS_HALFROW_SSE2 5815 5816#ifdef HAS_ARGBTOBAYERROW_SSSE3 5817void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, 5818 uint32 selector, int pix) { 5819 asm volatile ( 5820 // NaCL caveat - assumes movd is from GPR 5821 "movd %3,%%xmm5 \n" 5822 "pshufd $0x0,%%xmm5,%%xmm5 \n" 5823 LABELALIGN 5824 "1: \n" 5825 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5826 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5827 "lea " MEMLEA(0x20,0) ",%0 \n" 5828 "pshufb %%xmm5,%%xmm0 \n" 5829 "pshufb %%xmm5,%%xmm1 \n" 5830 "punpckldq %%xmm1,%%xmm0 \n" 5831 "sub $0x8,%2 \n" 5832 "movq %%xmm0," MEMACCESS(1) " \n" 5833 "lea " MEMLEA(0x8,1) ",%1 \n" 5834 "jg 1b \n" 5835 : "+r"(src_argb), // %0 5836 "+r"(dst_bayer), // %1 5837 "+r"(pix) // %2 5838 : "g"(selector) // %3 5839 : "memory", "cc" 5840#if defined(__SSE2__) 5841 , "xmm0", "xmm1", "xmm5" 5842#endif 5843 ); 5844} 5845#endif // HAS_ARGBTOBAYERROW_SSSE3 5846 5847#ifdef HAS_ARGBTOBAYERGGROW_SSE2 5848void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, 5849 uint32 selector, int pix) { 5850 asm volatile ( 5851 "pcmpeqb %%xmm5,%%xmm5 \n" 5852 "psrld $0x18,%%xmm5 \n" 5853 LABELALIGN 5854 "1: \n" 5855 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5856 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5857 "lea " MEMLEA(0x20,0) ",%0 \n" 5858 "psrld $0x8,%%xmm0 \n" 5859 "psrld $0x8,%%xmm1 \n" 5860 "pand %%xmm5,%%xmm0 \n" 5861 "pand %%xmm5,%%xmm1 \n" 5862 "packssdw %%xmm1,%%xmm0 \n" 5863 "packuswb %%xmm1,%%xmm0 \n" 5864 "sub $0x8,%2 \n" 5865 "movq %%xmm0," MEMACCESS(1) " \n" 5866 "lea " MEMLEA(0x8,1) ",%1 \n" 5867 "jg 1b \n" 5868 : "+r"(src_argb), // %0 5869 "+r"(dst_bayer), // %1 5870 "+r"(pix) // %2 5871 : 5872 : "memory", "cc" 5873#if defined(__SSE2__) 5874 , "xmm0", "xmm1", "xmm5" 5875#endif 5876 ); 5877} 5878#endif // HAS_ARGBTOBAYERGGROW_SSE2 5879 5880#ifdef HAS_ARGBSHUFFLEROW_SSSE3 5881// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
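// The 16-byte 'shuffler' is a pshufb table choosing a source byte for each
// output byte; e.g. a BGRA to ARGB table (illustrative) reverses each
// 4-byte group:
//   {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}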
5882void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5883 const uint8* shuffler, int pix) { 5884 asm volatile ( 5885 "movdqa " MEMACCESS(3) ",%%xmm5 \n" 5886 LABELALIGN 5887 "1: \n" 5888 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 5889 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5890 "lea " MEMLEA(0x20,0) ",%0 \n" 5891 "pshufb %%xmm5,%%xmm0 \n" 5892 "pshufb %%xmm5,%%xmm1 \n" 5893 "sub $0x8,%2 \n" 5894 "movdqa %%xmm0," MEMACCESS(1) " \n" 5895 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 5896 "lea " MEMLEA(0x20,1) ",%1 \n" 5897 "jg 1b \n" 5898 : "+r"(src_argb), // %0 5899 "+r"(dst_argb), // %1 5900 "+r"(pix) // %2 5901 : "r"(shuffler) // %3 5902 : "memory", "cc" 5903#if defined(__SSE2__) 5904 , "xmm0", "xmm1", "xmm5" 5905#endif 5906 ); 5907} 5908 5909void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, 5910 const uint8* shuffler, int pix) { 5911 asm volatile ( 5912 "movdqa " MEMACCESS(3) ",%%xmm5 \n" 5913 LABELALIGN 5914 "1: \n" 5915 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 5916 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 5917 "lea " MEMLEA(0x20,0) ",%0 \n" 5918 "pshufb %%xmm5,%%xmm0 \n" 5919 "pshufb %%xmm5,%%xmm1 \n" 5920 "sub $0x8,%2 \n" 5921 "movdqu %%xmm0," MEMACCESS(1) " \n" 5922 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 5923 "lea " MEMLEA(0x20,1) ",%1 \n" 5924 "jg 1b \n" 5925 : "+r"(src_argb), // %0 5926 "+r"(dst_argb), // %1 5927 "+r"(pix) // %2 5928 : "r"(shuffler) // %3 5929 : "memory", "cc" 5930#if defined(__SSE2__) 5931 , "xmm0", "xmm1", "xmm5" 5932#endif 5933 ); 5934} 5935#endif // HAS_ARGBSHUFFLEROW_SSSE3 5936 5937#ifdef HAS_ARGBSHUFFLEROW_AVX2 5938// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5939void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5940 const uint8* shuffler, int pix) { 5941 asm volatile ( 5942 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" 5943 LABELALIGN 5944 "1: \n" 5945 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 5946 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 5947 "lea " MEMLEA(0x40,0) ",%0 \n" 5948 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" 5949 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 5950 "sub $0x10,%2 \n" 5951 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 5952 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" 5953 "lea " MEMLEA(0x40,1) ",%1 \n" 5954 "jg 1b \n" 5955 : "+r"(src_argb), // %0 5956 "+r"(dst_argb), // %1 5957 "+r"(pix) // %2 5958 : "r"(shuffler) // %3 5959 : "memory", "cc" 5960#if defined(__SSE2__) 5961 , "xmm0", "xmm1", "xmm5" 5962#endif 5963 ); 5964} 5965#endif // HAS_ARGBSHUFFLEROW_AVX2 5966 5967#ifdef HAS_ARGBSHUFFLEROW_SSE2 5968// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
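// This SSE2 version has no pshufb, so it loads the first four shuffler
// bytes as one dword and matches them against four common channel orders
// (handled with pshuflw/pshufhw); any other order falls back to the
// byte-at-a-time loop.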
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"
    "mov       " MEMACCESS(4) ",%k2            \n"
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    BUNDLEALIGN
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)          // %3
  : "r"(shuffler)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2
"xmm0", "xmm1", "xmm5" 6088#endif 6089 ); 6090} 6091#endif // HAS_ARGBSHUFFLEROW_SSE2 6092 6093#ifdef HAS_I422TOYUY2ROW_SSE2 6094void I422ToYUY2Row_SSE2(const uint8* src_y, 6095 const uint8* src_u, 6096 const uint8* src_v, 6097 uint8* dst_frame, int width) { 6098 asm volatile ( 6099 "sub %1,%2 \n" 6100 LABELALIGN 6101 "1: \n" 6102 "movq " MEMACCESS(1) ",%%xmm2 \n" 6103 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 6104 "lea " MEMLEA(0x8,1) ",%1 \n" 6105 "punpcklbw %%xmm3,%%xmm2 \n" 6106 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 6107 "lea " MEMLEA(0x10,0) ",%0 \n" 6108 "movdqa %%xmm0,%%xmm1 \n" 6109 "punpcklbw %%xmm2,%%xmm0 \n" 6110 "punpckhbw %%xmm2,%%xmm1 \n" 6111 "movdqu %%xmm0," MEMACCESS(3) " \n" 6112 "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" 6113 "lea " MEMLEA(0x20,3) ",%3 \n" 6114 "sub $0x10,%4 \n" 6115 "jg 1b \n" 6116 : "+r"(src_y), // %0 6117 "+r"(src_u), // %1 6118 "+r"(src_v), // %2 6119 "+r"(dst_frame), // %3 6120 "+rm"(width) // %4 6121 : 6122 : "memory", "cc" 6123#if defined(__native_client__) && defined(__x86_64__) 6124 , "r14" 6125#endif 6126#if defined(__SSE2__) 6127 , "xmm0", "xmm1", "xmm2", "xmm3" 6128#endif 6129 ); 6130} 6131#endif // HAS_I422TOYUY2ROW_SSE2 6132 6133#ifdef HAS_I422TOUYVYROW_SSE2 6134void I422ToUYVYRow_SSE2(const uint8* src_y, 6135 const uint8* src_u, 6136 const uint8* src_v, 6137 uint8* dst_frame, int width) { 6138 asm volatile ( 6139 "sub %1,%2 \n" 6140 LABELALIGN 6141 "1: \n" 6142 "movq " MEMACCESS(1) ",%%xmm2 \n" 6143 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 6144 "lea " MEMLEA(0x8,1) ",%1 \n" 6145 "punpcklbw %%xmm3,%%xmm2 \n" 6146 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 6147 "movdqa %%xmm2,%%xmm1 \n" 6148 "lea " MEMLEA(0x10,0) ",%0 \n" 6149 "punpcklbw %%xmm0,%%xmm1 \n" 6150 "punpckhbw %%xmm0,%%xmm2 \n" 6151 "movdqu %%xmm1," MEMACCESS(3) " \n" 6152 "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" 6153 "lea " MEMLEA(0x20,3) ",%3 \n" 6154 "sub $0x10,%4 \n" 6155 "jg 1b \n" 6156 : "+r"(src_y), // %0 6157 "+r"(src_u), // %1 6158 "+r"(src_v), // %2 6159 "+r"(dst_frame), // %3 6160 "+rm"(width) // %4 6161 : 6162 : "memory", "cc" 6163#if defined(__native_client__) && defined(__x86_64__) 6164 , "r14" 6165#endif 6166#if defined(__SSE2__) 6167 , "xmm0", "xmm1", "xmm2", "xmm3" 6168#endif 6169 ); 6170} 6171#endif // HAS_I422TOUYVYROW_SSE2 6172 6173#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 6174void ARGBPolynomialRow_SSE2(const uint8* src_argb, 6175 uint8* dst_argb, const float* poly, 6176 int width) { 6177 asm volatile ( 6178 "pxor %%xmm3,%%xmm3 \n" 6179 6180 // 2 pixel loop. 
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x2,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4    \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0         \n"  // 2 ARGB pixels
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "vcvtdq2ps %%ymm0,%%ymm0                   \n"  // X 8 floats
    "vmulps    %%ymm0,%%ymm0,%%ymm2            \n"  // X * X
    "vmulps    %%ymm7,%%ymm0,%%ymm3            \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0                  \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0            \n"
    "sub       $0x2,%2                         \n"
    "vmovq     %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
// TODO(fbarchard): declare ymm usage when applicable.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
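
// A minimal plain-C sketch of the cubic evaluation above (hypothetical
// helper): poly holds 16 floats, one 4-float vector per coefficient C0..C3
// in channel order B,G,R,A, and every sample v is mapped to
// C0 + C1*v + C2*v^2 + C3*v^3, truncated and clamped to [0,255].
static inline void ARGBPolynomialRow_Sketch_C(const uint8* src_argb,
                                              uint8* dst_argb,
                                              const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int ch = i & 3;  // Channel within the pixel: B, G, R, A.
    float v = (float)(src_argb[i]);
    float r = poly[ch] + poly[ch + 4] * v + poly[ch + 8] * v * v +
              poly[ch + 12] * v * v * v;
    if (r < 0.f) r = 0.f;
    if (r > 255.f) r = 255.f;
    dst_argb[i] = (uint8)(r);
  }
}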
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  //  movzb     (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  //  movzb     0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  //  movzb     0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  //  movzb     0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),    // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)        // %2
  : "r"(table_argb)    // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  //  movzb     (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  //  movzb     0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  //  movzb     0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),    // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)        // %2
  : "r"(table_argb)    // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
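
// A minimal plain-C sketch of the table lookups above (hypothetical helper):
// table_argb holds four bytes per possible sample value, one per channel, so
// channel c of a pixel with value v becomes table_argb[v * 4 + c].
// RGBColorTableRow_X86 is identical except that it leaves alpha untouched.
static inline void ARGBColorTableRow_Sketch_C(uint8* dst_argb,
                                              const uint8* table_argb,
                                              int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}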
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "phaddw    %%xmm0,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    BUNDLEALIGN
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  //  movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "sub       $0x4,%4                         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "jg        1b                              \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif