1/* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#ifdef __cplusplus 14namespace libyuv { 15extern "C" { 16#endif 17 18// This module is for GCC x86 and x64. 19#if !defined(LIBYUV_DISABLE_X86) && \ 20 (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) 21 22#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) 23 24// Constants for ARGB 25static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, 26 13, 65, 33, 0, 13, 65, 33, 0}; 27 28// JPeg full range. 29static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, 30 15, 75, 38, 0, 15, 75, 38, 0}; 31#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) 32 33#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) 34 35static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, 36 112, -74, -38, 0, 112, -74, -38, 0}; 37 38static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, 39 127, -84, -43, 0, 127, -84, -43, 0}; 40 41static vec8 kARGBToV = { 42 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 43}; 44 45static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, 46 -20, -107, 127, 0, -20, -107, 127, 0}; 47 48// Constants for BGRA 49static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, 50 0, 33, 65, 13, 0, 33, 65, 13}; 51 52static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, 53 0, -38, -74, 112, 0, -38, -74, 112}; 54 55static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, 56 0, 112, -94, -18, 0, 112, -94, -18}; 57 58// Constants for ABGR 59static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, 60 33, 65, 13, 0, 33, 65, 13, 0}; 61 
62static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, 63 -38, -74, 112, 0, -38, -74, 112, 0}; 64 65static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, 66 112, -94, -18, 0, 112, -94, -18, 0}; 67 68// Constants for RGBA. 69static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, 70 0, 13, 65, 33, 0, 13, 65, 33}; 71 72static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, 73 0, 112, -74, -38, 0, 112, -74, -38}; 74 75static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, 76 0, -18, -94, 112, 0, -18, -94, 112}; 77 78static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 79 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; 80 81// 7 bit fixed point 0.5. 82static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; 83 84static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 85 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; 86 87static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 88 0x8080u, 0x8080u, 0x8080u, 0x8080u}; 89#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) 90 91#ifdef HAS_RGB24TOARGBROW_SSSE3 92 93// Shuffle table for converting RGB24 to ARGB. 94static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 95 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; 96 97// Shuffle table for converting RAW to ARGB. 98static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 99 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; 100 101// Shuffle table for converting RAW to RGB24. First 8. 102static const uvec8 kShuffleMaskRAWToRGB24_0 = { 103 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, 104 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; 105 106// Shuffle table for converting RAW to RGB24. Middle 8. 107static const uvec8 kShuffleMaskRAWToRGB24_1 = { 108 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, 109 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; 110 111// Shuffle table for converting RAW to RGB24. Last 8. 
112static const uvec8 kShuffleMaskRAWToRGB24_2 = { 113 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, 114 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; 115 116// Shuffle table for converting ARGB to RGB24. 117static uvec8 kShuffleMaskARGBToRGB24 = { 118 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; 119 120// Shuffle table for converting ARGB to RAW. 121static uvec8 kShuffleMaskARGBToRAW = { 122 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; 123 124// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 125static uvec8 kShuffleMaskARGBToRGB24_0 = { 126 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; 127 128// YUY2 shuf 16 Y to 32 Y. 129static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 130 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, 131 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; 132 133// YUY2 shuf 8 UV to 16 UV. 134static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 135 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, 136 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; 137 138// UYVY shuf 16 Y to 32 Y. 139static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 140 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, 141 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; 142 143// UYVY shuf 8 UV to 16 UV. 144static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 145 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, 146 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; 147 148// NV21 shuf 8 VU to 16 UV. 
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3

// Note for every row function in this file: the loops are "do ... while
// (width > 0)" (sub + jg), so the body always runs at least once and handles
// a fixed number of pixels per pass. Callers presumably pass a width that is
// a positive multiple of that step - TODO confirm against the callers.

#ifdef HAS_J400TOARGBROW_SSE2
// Expands 8 gray bytes per pass into 8 four-byte pixels: each source byte is
// replicated into all four bytes of a dword, then the top byte is forced to
// 0xff.
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    // xmm5 = 0xff000000 in every dword.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    // byte -> word -> dword replication.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
// Converts 16 three-byte pixels (48 bytes) per pass into 16 four-byte pixels
// (64 bytes). palignr lines up each 12-byte group at the bottom of a
// register, pshufb spreads it to 4 bytes per pixel, and por sets the fourth
// byte of every pixel to 0xff.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Same instruction sequence as RGB24ToARGBRow_SSSE3, using the RAW shuffle
// table, which also reverses the three colour bytes of each pixel.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Reverses the byte order inside each 3-byte pixel, 8 pixels (24 bytes) per
// pass. Three overlapping 16-byte loads at offsets 0, 4 and 8 are each
// shuffled down to 8 output bytes and stored with movq.
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"
    "movdqa %5,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x18,0) ",%0 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
    "m"(kShuffleMaskRAWToRGB24_1),  // %4
    "m"(kShuffleMaskRAWToRGB24_2)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Expands 8 16-bit RGB565 pixels (16 bytes) per pass into 8 four-byte pixels
// (32 bytes). The 5- and 6-bit fields are isolated with masks and widened to
// 8 bits with pmulhuw; the fourth byte of every pixel is set to 0xff.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm5 = 0x0108 per word, multiplier for the two 5-bit fields.
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    // xmm6 = 0x2080 per word, multiplier for the 6-bit field.
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    // xmm3 = 0xf800 per word (top 5 bits).
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    // xmm4 = 0x07e0 per word (middle 6 bits).
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    // xmm7 = 0xff00 per word (constant 0xff for the fourth byte).
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    // dst -= 2 * src, so (%1,%0,2) addresses the output while only the
    // source pointer is advanced in the loop.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)  // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)  // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Expands 8 16-bit ARGB1555 pixels per pass into 8 four-byte pixels. The
// three 5-bit fields are widened with pmulhuw; the top bit of each source
// word is sign-extended by psraw into a 0x00 / 0xff fourth byte.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm5 = 0x0108 per word, xmm6 = 0x4200 per word (pmulhuw multipliers).
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    // xmm3 = 0xf800, xmm4 = 0x03e0, xmm7 = 0xff00 per word.
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    // dst -= 2 * src; see the (%1,%0,2) stores below.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)  // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)  // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Expands 8 16-bit ARGB4444 pixels per pass into 8 four-byte pixels by
// duplicating every 4-bit nibble into both halves of an output byte.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm4 = 0x0f per byte (low nibbles), xmm5 = 0xf0 per byte (high nibbles).
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    // dst -= 2 * src; see the (%1,%0,2) stores below.
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)  // movdqu %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)  // movdqu %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Packs 16 four-byte pixels (64 bytes) per pass into 16 three-byte pixels
// (48 bytes). pshufb drops the fourth byte of each pixel, leaving 12 valid
// bytes per register; byte shifts and ORs then merge the four 12-byte pieces
// into three full 16-byte stores.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Same instruction sequence as ARGBToRGB24Row_SSSE3, using the RAW shuffle
// table, which also reverses the three kept bytes of each pixel.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Packs 4 four-byte pixels (16 bytes) per pass into 4 16-bit RGB565 pixels
// (8 bytes) by shifting each channel into place, masking, and ORing.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Per dword: xmm3 = 0x0000001f, xmm4 = 0x000007e0, xmm5 = 0xfffff800.
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    // The arithmetic shift leaves each dword as a sign-extended 16-bit
    // value, which packssdw below packs without saturating.
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Same packing as ARGBToRGB565Row_SSE2, but first adds a per-pixel dither
// value with unsigned saturation. dither4 holds 4 bytes; byte i is replicated
// four times so it is added to every byte of pixel i in each group of 4.
// This function addresses memory with plain (%0)/(%1) operands rather than
// the MEMACCESS macros used elsewhere in this file.
void ARGBToRGB565DitherRow_SSE2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      // xmm6 = each dither byte repeated 4 times. xmm7 is also computed here
      // but is not read by the loop below.
      "movd %3,%%xmm6 \n"
      "punpcklbw %%xmm6,%%xmm6 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "punpcklwd %%xmm6,%%xmm6 \n"
      "punpckhwd %%xmm7,%%xmm7 \n"
      // Per dword: xmm3 = 0x0000001f, xmm4 = 0x000007e0, xmm5 = 0xfffff800.
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "psrld $0x1b,%%xmm3 \n"
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psrld $0x1a,%%xmm4 \n"
      "pslld $0x5,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "pslld $0xb,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "paddusb %%xmm6,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "pslld $0x8,%%xmm0 \n"
      "psrld $0x3,%%xmm1 \n"
      "psrld $0x5,%%xmm2 \n"
      "psrad $0x10,%%xmm0 \n"
      "pand %%xmm3,%%xmm1 \n"
      "pand %%xmm4,%%xmm2 \n"
      "pand %%xmm5,%%xmm0 \n"
      "por %%xmm2,%%xmm1 \n"
      "por %%xmm1,%%xmm0 \n"
      "packssdw %%xmm0,%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 form of ARGBToRGB565DitherRow_SSE2: 8 pixels (32 bytes) in and
// 16 bytes out per pass. vzeroupper is issued after the loop.
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      // Build the replicated dither pattern in ymm6.
      "vbroadcastss %3,%%xmm6 \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
      "vpermq $0xd8,%%ymm6,%%ymm6 \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
      // Per dword: ymm3 = 0x0000001f, ymm4 = 0x000007e0, ymm5 = 0x0000f800.
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
      "vpslld $0x5,%%ymm4,%%ymm4 \n"
      "vpslld $0xb,%%ymm3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
      "vpsrld $0x5,%%ymm0,%%ymm2 \n"
      "vpsrld $0x3,%%ymm0,%%ymm1 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm4,%%ymm2,%%ymm2 \n"
      "vpand %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpor %%ymm2,%%ymm1,%%ymm1 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"
      // vpackusdw packs within each 128-bit lane; vpermq then gathers the
      // two valid quadwords into the low 128 bits for the store.
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "lea 0x20(%0),%0 \n"
      "vmovdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// Packs 4 four-byte pixels per pass into 4 16-bit ARGB1555 pixels: three
// 5-bit fields plus the top bit of the source dword as bit 15.
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Per dword: xmm4 = 0x0000001f, xmm5 = 0x000003e0, xmm6 = 0x00007c00,
    // xmm7 = 0xffff8000.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Packs 4 four-byte pixels per pass into 4 16-bit ARGB4444 pixels by keeping
// the high nibble of every source byte. xmm2 is named in the clobber list
// but is not touched by the code.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Per word: xmm4 = 0xf000, xmm3 = 0x00f0.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// pmaddubsw multiplies each pixel's bytes by the kARGBToY coefficients and
// sums adjacent pairs; phaddw finishes the per-pixel sum. The result is
// shifted right by 7, packed to bytes, and biased by kAddY16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
745void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { 746 asm volatile ( 747 "movdqa %3,%%xmm4 \n" 748 "movdqa %4,%%xmm5 \n" 749 750 LABELALIGN 751 "1: \n" 752 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 753 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 754 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 755 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 756 "pmaddubsw %%xmm4,%%xmm0 \n" 757 "pmaddubsw %%xmm4,%%xmm1 \n" 758 "pmaddubsw %%xmm4,%%xmm2 \n" 759 "pmaddubsw %%xmm4,%%xmm3 \n" 760 "lea " MEMLEA(0x40,0) ",%0 \n" 761 "phaddw %%xmm1,%%xmm0 \n" 762 "phaddw %%xmm3,%%xmm2 \n" 763 "paddw %%xmm5,%%xmm0 \n" 764 "paddw %%xmm5,%%xmm2 \n" 765 "psrlw $0x7,%%xmm0 \n" 766 "psrlw $0x7,%%xmm2 \n" 767 "packuswb %%xmm2,%%xmm0 \n" 768 "movdqu %%xmm0," MEMACCESS(1) " \n" 769 "lea " MEMLEA(0x10,1) ",%1 \n" 770 "sub $0x10,%2 \n" 771 "jg 1b \n" 772 : "+r"(src_argb), // %0 773 "+r"(dst_y), // %1 774 "+r"(width) // %2 775 : "m"(kARGBToYJ), // %3 776 "m"(kAddYJ64) // %4 777 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 778 ); 779} 780#endif // HAS_ARGBTOYJROW_SSSE3 781 782#ifdef HAS_ARGBTOYROW_AVX2 783// vpermd for vphaddw + vpackuswb vpermd. 784static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; 785 786// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 787void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { 788 asm volatile ( 789 "vbroadcastf128 %3,%%ymm4 \n" 790 "vbroadcastf128 %4,%%ymm5 \n" 791 "vmovdqu %5,%%ymm6 \n" 792 793 LABELALIGN 794 "1: \n" 795 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 796 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 797 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" 798 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" 799 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" 800 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" 801 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" 802 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" 803 "lea " MEMLEA(0x80,0) ",%0 \n" 804 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. 
805 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" 806 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" 807 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" 808 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 809 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 810 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y 811 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 812 "lea " MEMLEA(0x20,1) ",%1 \n" 813 "sub $0x20,%2 \n" 814 "jg 1b \n" 815 "vzeroupper \n" 816 : "+r"(src_argb), // %0 817 "+r"(dst_y), // %1 818 "+r"(width) // %2 819 : "m"(kARGBToY), // %3 820 "m"(kAddY16), // %4 821 "m"(kPermdARGBToY_AVX) // %5 822 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 823 ); 824} 825#endif // HAS_ARGBTOYROW_AVX2 826 827#ifdef HAS_ARGBTOYJROW_AVX2 828// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 829void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { 830 asm volatile ( 831 "vbroadcastf128 %3,%%ymm4 \n" 832 "vbroadcastf128 %4,%%ymm5 \n" 833 "vmovdqu %5,%%ymm6 \n" 834 835 LABELALIGN 836 "1: \n" 837 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 838 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 839 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" 840 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" 841 "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" 842 "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" 843 "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" 844 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" 845 "lea " MEMLEA(0x80,0) ",%0 \n" 846 "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. 847 "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" 848 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. 849 "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" 850 "vpsrlw $0x7,%%ymm0,%%ymm0 \n" 851 "vpsrlw $0x7,%%ymm2,%%ymm2 \n" 852 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 853 "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
854 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 855 "lea " MEMLEA(0x20,1) ",%1 \n" 856 "sub $0x20,%2 \n" 857 "jg 1b \n" 858 "vzeroupper \n" 859 : "+r"(src_argb), // %0 860 "+r"(dst_y), // %1 861 "+r"(width) // %2 862 : "m"(kARGBToYJ), // %3 863 "m"(kAddYJ64), // %4 864 "m"(kPermdARGBToY_AVX) // %5 865 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 866 ); 867} 868#endif // HAS_ARGBTOYJROW_AVX2 869 870#ifdef HAS_ARGBTOUVROW_SSSE3 871void ARGBToUVRow_SSSE3(const uint8* src_argb0, 872 int src_stride_argb, 873 uint8* dst_u, 874 uint8* dst_v, 875 int width) { 876 asm volatile ( 877 "movdqa %5,%%xmm3 \n" 878 "movdqa %6,%%xmm4 \n" 879 "movdqa %7,%%xmm5 \n" 880 "sub %1,%2 \n" 881 882 LABELALIGN 883 "1: \n" 884 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 885 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 886 "pavgb %%xmm7,%%xmm0 \n" 887 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 888 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 889 "pavgb %%xmm7,%%xmm1 \n" 890 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 891 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 892 "pavgb %%xmm7,%%xmm2 \n" 893 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 894 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 895 "pavgb %%xmm7,%%xmm6 \n" 896 897 "lea " MEMLEA(0x40,0) ",%0 \n" 898 "movdqa %%xmm0,%%xmm7 \n" 899 "shufps $0x88,%%xmm1,%%xmm0 \n" 900 "shufps $0xdd,%%xmm1,%%xmm7 \n" 901 "pavgb %%xmm7,%%xmm0 \n" 902 "movdqa %%xmm2,%%xmm7 \n" 903 "shufps $0x88,%%xmm6,%%xmm2 \n" 904 "shufps $0xdd,%%xmm6,%%xmm7 \n" 905 "pavgb %%xmm7,%%xmm2 \n" 906 "movdqa %%xmm0,%%xmm1 \n" 907 "movdqa %%xmm2,%%xmm6 \n" 908 "pmaddubsw %%xmm4,%%xmm0 \n" 909 "pmaddubsw %%xmm4,%%xmm2 \n" 910 "pmaddubsw %%xmm3,%%xmm1 \n" 911 "pmaddubsw %%xmm3,%%xmm6 \n" 912 "phaddw %%xmm2,%%xmm0 \n" 913 "phaddw %%xmm6,%%xmm1 \n" 914 "psraw $0x8,%%xmm0 \n" 915 "psraw $0x8,%%xmm1 \n" 916 "packsswb %%xmm1,%%xmm0 \n" 917 "paddb %%xmm5,%%xmm0 \n" 918 "movlps %%xmm0," MEMACCESS(1) " \n" 919 
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 920 "lea " MEMLEA(0x8,1) ",%1 \n" 921 "sub $0x10,%3 \n" 922 "jg 1b \n" 923 : "+r"(src_argb0), // %0 924 "+r"(dst_u), // %1 925 "+r"(dst_v), // %2 926 "+rm"(width) // %3 927 : "r"((intptr_t)(src_stride_argb)), // %4 928 "m"(kARGBToV), // %5 929 "m"(kARGBToU), // %6 930 "m"(kAddUV128) // %7 931 : "memory", "cc", NACL_R14 932 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 933 ); 934} 935#endif // HAS_ARGBTOUVROW_SSSE3 936 937#ifdef HAS_ARGBTOUVROW_AVX2 938// vpshufb for vphaddw + vpackuswb packed to shorts. 939static const lvec8 kShufARGBToUV_AVX = { 940 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 941 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; 942void ARGBToUVRow_AVX2(const uint8* src_argb0, 943 int src_stride_argb, 944 uint8* dst_u, 945 uint8* dst_v, 946 int width) { 947 asm volatile ( 948 "vbroadcastf128 %5,%%ymm5 \n" 949 "vbroadcastf128 %6,%%ymm6 \n" 950 "vbroadcastf128 %7,%%ymm7 \n" 951 "sub %1,%2 \n" 952 953 LABELALIGN 954 "1: \n" 955 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 956 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 957 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" 958 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" 959 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 960 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) 961 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) 962 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) 963 "lea " MEMLEA(0x80,0) ",%0 \n" 964 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" 965 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" 966 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" 967 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" 968 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" 969 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" 970 971 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" 972 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" 973 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" 974 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" 975 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" 976 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" 977 "vpsraw $0x8,%%ymm1,%%ymm1 \n" 978 "vpsraw 
$0x8,%%ymm0,%%ymm0 \n" 979 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" 980 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 981 "vpshufb %8,%%ymm0,%%ymm0 \n" 982 "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" 983 984 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" 985 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) 986 "lea " MEMLEA(0x10,1) ",%1 \n" 987 "sub $0x20,%3 \n" 988 "jg 1b \n" 989 "vzeroupper \n" 990 : "+r"(src_argb0), // %0 991 "+r"(dst_u), // %1 992 "+r"(dst_v), // %2 993 "+rm"(width) // %3 994 : "r"((intptr_t)(src_stride_argb)), // %4 995 "m"(kAddUV128), // %5 996 "m"(kARGBToV), // %6 997 "m"(kARGBToU), // %7 998 "m"(kShufARGBToUV_AVX) // %8 999 : "memory", "cc", NACL_R14 1000 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1001 ); 1002} 1003#endif // HAS_ARGBTOUVROW_AVX2 1004 1005#ifdef HAS_ARGBTOUVJROW_AVX2 1006void ARGBToUVJRow_AVX2(const uint8* src_argb0, 1007 int src_stride_argb, 1008 uint8* dst_u, 1009 uint8* dst_v, 1010 int width) { 1011 asm volatile ( 1012 "vbroadcastf128 %5,%%ymm5 \n" 1013 "vbroadcastf128 %6,%%ymm6 \n" 1014 "vbroadcastf128 %7,%%ymm7 \n" 1015 "sub %1,%2 \n" 1016 1017 LABELALIGN 1018 "1: \n" 1019 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 1020 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 1021 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" 1022 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" 1023 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 1024 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) 1025 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) 1026 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) 1027 "lea " MEMLEA(0x80,0) ",%0 \n" 1028 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" 1029 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" 1030 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" 1031 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" 1032 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" 1033 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" 1034 1035 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" 1036 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" 1037 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" 1038 "vpmaddubsw 
%%ymm6,%%ymm2,%%ymm2 \n" 1039 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" 1040 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" 1041 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" 1042 "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" 1043 "vpsraw $0x8,%%ymm1,%%ymm1 \n" 1044 "vpsraw $0x8,%%ymm0,%%ymm0 \n" 1045 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" 1046 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 1047 "vpshufb %8,%%ymm0,%%ymm0 \n" 1048 1049 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" 1050 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) 1051 "lea " MEMLEA(0x10,1) ",%1 \n" 1052 "sub $0x20,%3 \n" 1053 "jg 1b \n" 1054 "vzeroupper \n" 1055 : "+r"(src_argb0), // %0 1056 "+r"(dst_u), // %1 1057 "+r"(dst_v), // %2 1058 "+rm"(width) // %3 1059 : "r"((intptr_t)(src_stride_argb)), // %4 1060 "m"(kAddUVJ128), // %5 1061 "m"(kARGBToVJ), // %6 1062 "m"(kARGBToUJ), // %7 1063 "m"(kShufARGBToUV_AVX) // %8 1064 : "memory", "cc", NACL_R14 1065 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1066 ); 1067} 1068#endif // HAS_ARGBTOUVJROW_AVX2 1069 1070#ifdef HAS_ARGBTOUVJROW_SSSE3 1071void ARGBToUVJRow_SSSE3(const uint8* src_argb0, 1072 int src_stride_argb, 1073 uint8* dst_u, 1074 uint8* dst_v, 1075 int width) { 1076 asm volatile ( 1077 "movdqa %5,%%xmm3 \n" 1078 "movdqa %6,%%xmm4 \n" 1079 "movdqa %7,%%xmm5 \n" 1080 "sub %1,%2 \n" 1081 1082 LABELALIGN 1083 "1: \n" 1084 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1085 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1086 "pavgb %%xmm7,%%xmm0 \n" 1087 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1088 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1089 "pavgb %%xmm7,%%xmm1 \n" 1090 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1091 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1092 "pavgb %%xmm7,%%xmm2 \n" 1093 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1094 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1095 "pavgb %%xmm7,%%xmm6 \n" 1096 1097 "lea " MEMLEA(0x40,0) ",%0 \n" 1098 "movdqa %%xmm0,%%xmm7 \n" 1099 
"shufps $0x88,%%xmm1,%%xmm0 \n" 1100 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1101 "pavgb %%xmm7,%%xmm0 \n" 1102 "movdqa %%xmm2,%%xmm7 \n" 1103 "shufps $0x88,%%xmm6,%%xmm2 \n" 1104 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1105 "pavgb %%xmm7,%%xmm2 \n" 1106 "movdqa %%xmm0,%%xmm1 \n" 1107 "movdqa %%xmm2,%%xmm6 \n" 1108 "pmaddubsw %%xmm4,%%xmm0 \n" 1109 "pmaddubsw %%xmm4,%%xmm2 \n" 1110 "pmaddubsw %%xmm3,%%xmm1 \n" 1111 "pmaddubsw %%xmm3,%%xmm6 \n" 1112 "phaddw %%xmm2,%%xmm0 \n" 1113 "phaddw %%xmm6,%%xmm1 \n" 1114 "paddw %%xmm5,%%xmm0 \n" 1115 "paddw %%xmm5,%%xmm1 \n" 1116 "psraw $0x8,%%xmm0 \n" 1117 "psraw $0x8,%%xmm1 \n" 1118 "packsswb %%xmm1,%%xmm0 \n" 1119 "movlps %%xmm0," MEMACCESS(1) " \n" 1120 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1121 "lea " MEMLEA(0x8,1) ",%1 \n" 1122 "sub $0x10,%3 \n" 1123 "jg 1b \n" 1124 : "+r"(src_argb0), // %0 1125 "+r"(dst_u), // %1 1126 "+r"(dst_v), // %2 1127 "+rm"(width) // %3 1128 : "r"((intptr_t)(src_stride_argb)), // %4 1129 "m"(kARGBToVJ), // %5 1130 "m"(kARGBToUJ), // %6 1131 "m"(kAddUVJ128) // %7 1132 : "memory", "cc", NACL_R14 1133 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1134 ); 1135} 1136#endif // HAS_ARGBTOUVJROW_SSSE3 1137 1138#ifdef HAS_ARGBTOUV444ROW_SSSE3 1139void ARGBToUV444Row_SSSE3(const uint8* src_argb, 1140 uint8* dst_u, 1141 uint8* dst_v, 1142 int width) { 1143 asm volatile ( 1144 "movdqa %4,%%xmm3 \n" 1145 "movdqa %5,%%xmm4 \n" 1146 "movdqa %6,%%xmm5 \n" 1147 "sub %1,%2 \n" 1148 1149 LABELALIGN 1150 "1: \n" 1151 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1152 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1153 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1154 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1155 "pmaddubsw %%xmm4,%%xmm0 \n" 1156 "pmaddubsw %%xmm4,%%xmm1 \n" 1157 "pmaddubsw %%xmm4,%%xmm2 \n" 1158 "pmaddubsw %%xmm4,%%xmm6 \n" 1159 "phaddw %%xmm1,%%xmm0 \n" 1160 "phaddw %%xmm6,%%xmm2 \n" 1161 "psraw $0x8,%%xmm0 \n" 1162 "psraw $0x8,%%xmm2 \n" 1163 "packsswb %%xmm2,%%xmm0 \n" 1164 "paddb %%xmm5,%%xmm0 \n" 1165 "movdqu 
%%xmm0," MEMACCESS(1) " \n" 1166 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1167 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1168 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1169 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1170 "pmaddubsw %%xmm3,%%xmm0 \n" 1171 "pmaddubsw %%xmm3,%%xmm1 \n" 1172 "pmaddubsw %%xmm3,%%xmm2 \n" 1173 "pmaddubsw %%xmm3,%%xmm6 \n" 1174 "phaddw %%xmm1,%%xmm0 \n" 1175 "phaddw %%xmm6,%%xmm2 \n" 1176 "psraw $0x8,%%xmm0 \n" 1177 "psraw $0x8,%%xmm2 \n" 1178 "packsswb %%xmm2,%%xmm0 \n" 1179 "paddb %%xmm5,%%xmm0 \n" 1180 "lea " MEMLEA(0x40,0) ",%0 \n" 1181 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) 1182 "lea " MEMLEA(0x10,1) ",%1 \n" 1183 "sub $0x10,%3 \n" 1184 "jg 1b \n" 1185 : "+r"(src_argb), // %0 1186 "+r"(dst_u), // %1 1187 "+r"(dst_v), // %2 1188 "+rm"(width) // %3 1189 : "m"(kARGBToV), // %4 1190 "m"(kARGBToU), // %5 1191 "m"(kAddUV128) // %6 1192 : "memory", "cc", NACL_R14 1193 "xmm0", "xmm1", "xmm2", "xmm6" 1194 ); 1195} 1196#endif // HAS_ARGBTOUV444ROW_SSSE3 1197 1198void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { 1199 asm volatile ( 1200 "movdqa %4,%%xmm5 \n" 1201 "movdqa %3,%%xmm4 \n" 1202 1203 LABELALIGN 1204 "1: \n" 1205 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1206 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1207 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1208 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1209 "pmaddubsw %%xmm4,%%xmm0 \n" 1210 "pmaddubsw %%xmm4,%%xmm1 \n" 1211 "pmaddubsw %%xmm4,%%xmm2 \n" 1212 "pmaddubsw %%xmm4,%%xmm3 \n" 1213 "lea " MEMLEA(0x40,0) ",%0 \n" 1214 "phaddw %%xmm1,%%xmm0 \n" 1215 "phaddw %%xmm3,%%xmm2 \n" 1216 "psrlw $0x7,%%xmm0 \n" 1217 "psrlw $0x7,%%xmm2 \n" 1218 "packuswb %%xmm2,%%xmm0 \n" 1219 "paddb %%xmm5,%%xmm0 \n" 1220 "movdqu %%xmm0," MEMACCESS(1) " \n" 1221 "lea " MEMLEA(0x10,1) ",%1 \n" 1222 "sub $0x10,%2 \n" 1223 "jg 1b \n" 1224 : "+r"(src_bgra), // %0 1225 "+r"(dst_y), // %1 1226 "+r"(width) // %2 1227 : "m"(kBGRAToY), // %3 1228 "m"(kAddY16) // %4 1229 : "memory", "cc", 
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1230 ); 1231} 1232 1233void BGRAToUVRow_SSSE3(const uint8* src_bgra0, 1234 int src_stride_bgra, 1235 uint8* dst_u, 1236 uint8* dst_v, 1237 int width) { 1238 asm volatile ( 1239 "movdqa %5,%%xmm3 \n" 1240 "movdqa %6,%%xmm4 \n" 1241 "movdqa %7,%%xmm5 \n" 1242 "sub %1,%2 \n" 1243 1244 LABELALIGN 1245 "1: \n" 1246 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1247 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1248 "pavgb %%xmm7,%%xmm0 \n" 1249 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1250 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1251 "pavgb %%xmm7,%%xmm1 \n" 1252 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1253 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1254 "pavgb %%xmm7,%%xmm2 \n" 1255 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1256 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1257 "pavgb %%xmm7,%%xmm6 \n" 1258 1259 "lea " MEMLEA(0x40,0) ",%0 \n" 1260 "movdqa %%xmm0,%%xmm7 \n" 1261 "shufps $0x88,%%xmm1,%%xmm0 \n" 1262 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1263 "pavgb %%xmm7,%%xmm0 \n" 1264 "movdqa %%xmm2,%%xmm7 \n" 1265 "shufps $0x88,%%xmm6,%%xmm2 \n" 1266 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1267 "pavgb %%xmm7,%%xmm2 \n" 1268 "movdqa %%xmm0,%%xmm1 \n" 1269 "movdqa %%xmm2,%%xmm6 \n" 1270 "pmaddubsw %%xmm4,%%xmm0 \n" 1271 "pmaddubsw %%xmm4,%%xmm2 \n" 1272 "pmaddubsw %%xmm3,%%xmm1 \n" 1273 "pmaddubsw %%xmm3,%%xmm6 \n" 1274 "phaddw %%xmm2,%%xmm0 \n" 1275 "phaddw %%xmm6,%%xmm1 \n" 1276 "psraw $0x8,%%xmm0 \n" 1277 "psraw $0x8,%%xmm1 \n" 1278 "packsswb %%xmm1,%%xmm0 \n" 1279 "paddb %%xmm5,%%xmm0 \n" 1280 "movlps %%xmm0," MEMACCESS(1) " \n" 1281 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1282 "lea " MEMLEA(0x8,1) ",%1 \n" 1283 "sub $0x10,%3 \n" 1284 "jg 1b \n" 1285 : "+r"(src_bgra0), // %0 1286 "+r"(dst_u), // %1 1287 "+r"(dst_v), // %2 1288 "+rm"(width) // %3 1289 : "r"((intptr_t)(src_stride_bgra)), // %4 1290 "m"(kBGRAToV), // %5 1291 "m"(kBGRAToU), // %6 
1292 "m"(kAddUV128) // %7 1293 : "memory", "cc", NACL_R14 1294 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1295 ); 1296} 1297 1298void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { 1299 asm volatile ( 1300 "movdqa %4,%%xmm5 \n" 1301 "movdqa %3,%%xmm4 \n" 1302 1303 LABELALIGN 1304 "1: \n" 1305 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1306 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1307 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1308 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1309 "pmaddubsw %%xmm4,%%xmm0 \n" 1310 "pmaddubsw %%xmm4,%%xmm1 \n" 1311 "pmaddubsw %%xmm4,%%xmm2 \n" 1312 "pmaddubsw %%xmm4,%%xmm3 \n" 1313 "lea " MEMLEA(0x40,0) ",%0 \n" 1314 "phaddw %%xmm1,%%xmm0 \n" 1315 "phaddw %%xmm3,%%xmm2 \n" 1316 "psrlw $0x7,%%xmm0 \n" 1317 "psrlw $0x7,%%xmm2 \n" 1318 "packuswb %%xmm2,%%xmm0 \n" 1319 "paddb %%xmm5,%%xmm0 \n" 1320 "movdqu %%xmm0," MEMACCESS(1) " \n" 1321 "lea " MEMLEA(0x10,1) ",%1 \n" 1322 "sub $0x10,%2 \n" 1323 "jg 1b \n" 1324 : "+r"(src_abgr), // %0 1325 "+r"(dst_y), // %1 1326 "+r"(width) // %2 1327 : "m"(kABGRToY), // %3 1328 "m"(kAddY16) // %4 1329 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1330 ); 1331} 1332 1333void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { 1334 asm volatile ( 1335 "movdqa %4,%%xmm5 \n" 1336 "movdqa %3,%%xmm4 \n" 1337 1338 LABELALIGN 1339 "1: \n" 1340 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1341 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1342 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1343 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1344 "pmaddubsw %%xmm4,%%xmm0 \n" 1345 "pmaddubsw %%xmm4,%%xmm1 \n" 1346 "pmaddubsw %%xmm4,%%xmm2 \n" 1347 "pmaddubsw %%xmm4,%%xmm3 \n" 1348 "lea " MEMLEA(0x40,0) ",%0 \n" 1349 "phaddw %%xmm1,%%xmm0 \n" 1350 "phaddw %%xmm3,%%xmm2 \n" 1351 "psrlw $0x7,%%xmm0 \n" 1352 "psrlw $0x7,%%xmm2 \n" 1353 "packuswb %%xmm2,%%xmm0 \n" 1354 "paddb %%xmm5,%%xmm0 \n" 1355 "movdqu %%xmm0," MEMACCESS(1) " \n" 1356 "lea " MEMLEA(0x10,1) ",%1 \n" 1357 "sub $0x10,%2 \n" 1358 
"jg 1b \n" 1359 : "+r"(src_rgba), // %0 1360 "+r"(dst_y), // %1 1361 "+r"(width) // %2 1362 : "m"(kRGBAToY), // %3 1363 "m"(kAddY16) // %4 1364 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1365 ); 1366} 1367 1368void ABGRToUVRow_SSSE3(const uint8* src_abgr0, 1369 int src_stride_abgr, 1370 uint8* dst_u, 1371 uint8* dst_v, 1372 int width) { 1373 asm volatile ( 1374 "movdqa %5,%%xmm3 \n" 1375 "movdqa %6,%%xmm4 \n" 1376 "movdqa %7,%%xmm5 \n" 1377 "sub %1,%2 \n" 1378 1379 LABELALIGN 1380 "1: \n" 1381 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1382 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1383 "pavgb %%xmm7,%%xmm0 \n" 1384 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1385 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1386 "pavgb %%xmm7,%%xmm1 \n" 1387 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1388 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1389 "pavgb %%xmm7,%%xmm2 \n" 1390 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1391 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1392 "pavgb %%xmm7,%%xmm6 \n" 1393 1394 "lea " MEMLEA(0x40,0) ",%0 \n" 1395 "movdqa %%xmm0,%%xmm7 \n" 1396 "shufps $0x88,%%xmm1,%%xmm0 \n" 1397 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1398 "pavgb %%xmm7,%%xmm0 \n" 1399 "movdqa %%xmm2,%%xmm7 \n" 1400 "shufps $0x88,%%xmm6,%%xmm2 \n" 1401 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1402 "pavgb %%xmm7,%%xmm2 \n" 1403 "movdqa %%xmm0,%%xmm1 \n" 1404 "movdqa %%xmm2,%%xmm6 \n" 1405 "pmaddubsw %%xmm4,%%xmm0 \n" 1406 "pmaddubsw %%xmm4,%%xmm2 \n" 1407 "pmaddubsw %%xmm3,%%xmm1 \n" 1408 "pmaddubsw %%xmm3,%%xmm6 \n" 1409 "phaddw %%xmm2,%%xmm0 \n" 1410 "phaddw %%xmm6,%%xmm1 \n" 1411 "psraw $0x8,%%xmm0 \n" 1412 "psraw $0x8,%%xmm1 \n" 1413 "packsswb %%xmm1,%%xmm0 \n" 1414 "paddb %%xmm5,%%xmm0 \n" 1415 "movlps %%xmm0," MEMACCESS(1) " \n" 1416 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1417 "lea " MEMLEA(0x8,1) ",%1 \n" 1418 "sub $0x10,%3 \n" 1419 "jg 1b \n" 1420 : "+r"(src_abgr0), // %0 1421 
"+r"(dst_u), // %1 1422 "+r"(dst_v), // %2 1423 "+rm"(width) // %3 1424 : "r"((intptr_t)(src_stride_abgr)), // %4 1425 "m"(kABGRToV), // %5 1426 "m"(kABGRToU), // %6 1427 "m"(kAddUV128) // %7 1428 : "memory", "cc", NACL_R14 1429 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1430 ); 1431} 1432 1433void RGBAToUVRow_SSSE3(const uint8* src_rgba0, 1434 int src_stride_rgba, 1435 uint8* dst_u, 1436 uint8* dst_v, 1437 int width) { 1438 asm volatile ( 1439 "movdqa %5,%%xmm3 \n" 1440 "movdqa %6,%%xmm4 \n" 1441 "movdqa %7,%%xmm5 \n" 1442 "sub %1,%2 \n" 1443 1444 LABELALIGN 1445 "1: \n" 1446 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1447 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1448 "pavgb %%xmm7,%%xmm0 \n" 1449 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1450 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1451 "pavgb %%xmm7,%%xmm1 \n" 1452 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1453 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1454 "pavgb %%xmm7,%%xmm2 \n" 1455 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1456 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1457 "pavgb %%xmm7,%%xmm6 \n" 1458 1459 "lea " MEMLEA(0x40,0) ",%0 \n" 1460 "movdqa %%xmm0,%%xmm7 \n" 1461 "shufps $0x88,%%xmm1,%%xmm0 \n" 1462 "shufps $0xdd,%%xmm1,%%xmm7 \n" 1463 "pavgb %%xmm7,%%xmm0 \n" 1464 "movdqa %%xmm2,%%xmm7 \n" 1465 "shufps $0x88,%%xmm6,%%xmm2 \n" 1466 "shufps $0xdd,%%xmm6,%%xmm7 \n" 1467 "pavgb %%xmm7,%%xmm2 \n" 1468 "movdqa %%xmm0,%%xmm1 \n" 1469 "movdqa %%xmm2,%%xmm6 \n" 1470 "pmaddubsw %%xmm4,%%xmm0 \n" 1471 "pmaddubsw %%xmm4,%%xmm2 \n" 1472 "pmaddubsw %%xmm3,%%xmm1 \n" 1473 "pmaddubsw %%xmm3,%%xmm6 \n" 1474 "phaddw %%xmm2,%%xmm0 \n" 1475 "phaddw %%xmm6,%%xmm1 \n" 1476 "psraw $0x8,%%xmm0 \n" 1477 "psraw $0x8,%%xmm1 \n" 1478 "packsswb %%xmm1,%%xmm0 \n" 1479 "paddb %%xmm5,%%xmm0 \n" 1480 "movlps %%xmm0," MEMACCESS(1) " \n" 1481 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1482 "lea " MEMLEA(0x8,1) ",%1 \n" 1483 "sub $0x10,%3 
\n" 1484 "jg 1b \n" 1485 : "+r"(src_rgba0), // %0 1486 "+r"(dst_u), // %1 1487 "+r"(dst_v), // %2 1488 "+rm"(width) // %3 1489 : "r"((intptr_t)(src_stride_rgba)), // %4 1490 "m"(kRGBAToV), // %5 1491 "m"(kRGBAToU), // %6 1492 "m"(kAddUV128) // %7 1493 : "memory", "cc", NACL_R14 1494 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1495 ); 1496} 1497 1498#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) 1499 1500// Read 8 UV from 444 1501#define READYUV444 \ 1502 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1503 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1504 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ 1505 "punpcklbw %%xmm1,%%xmm0 \n" \ 1506 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1507 "punpcklbw %%xmm4,%%xmm4 \n" \ 1508 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" 1509 1510// Read 4 UV from 422, upsample to 8 UV 1511#define READYUV422 \ 1512 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ 1513 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ 1514 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ 1515 "punpcklbw %%xmm1,%%xmm0 \n" \ 1516 "punpcklwd %%xmm0,%%xmm0 \n" \ 1517 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ 1518 "punpcklbw %%xmm4,%%xmm4 \n" \ 1519 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" 1520 1521// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
// Loads 4 bytes from u_buf and 4 from (u_buf + v_buf), interleaves them and
// duplicates each 16-bit pair, loads 8 Y bytes into xmm4 with each byte
// duplicated, and loads 8 bytes from a_buf into xmm5.
// Advances u_buf by 4 and y_buf / a_buf by 8.
#define READYUVA422 \
  "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
  "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"

// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 bytes from uv_buf into xmm0 and duplicates each 16-bit element
// (punpcklwd); loads 8 Y bytes into xmm4 with each byte duplicated.
// Advances uv_buf and y_buf by 8.
#define READNV12 \
  "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 VU from NV21, upsample to 8 UV.
// Loads 8 bytes from vu_buf into xmm0 and rearranges them with the
// kShuffleNV21 pshufb mask (supplied by the caller as an asm operand);
// loads 8 Y bytes into xmm4 with each byte duplicated.
// Advances vu_buf and y_buf by 8.
#define READNV21 \
  "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
  "pshufb %[kShuffleNV21], %%xmm0 \n" \
  "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
// Loads the same 16 bytes twice: once shuffled with kShuffleYUY2Y into xmm4
// and once shuffled with kShuffleYUY2UV into xmm0. Advances yuy2_buf by 16.
#define READYUY2 \
  "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
  "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
  "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
  "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"

// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
// Loads the same 16 bytes twice: once shuffled with kShuffleUYVYY into xmm4
// and once shuffled with kShuffleUYVYUV into xmm0. Advances uyvy_buf by 16.
#define READUYVY \
  "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
  "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
  "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"

#if defined(__x86_64__)
// x86_64: preload the seven constant vectors found at byte offsets
// 0, 32, 64, 96, 128, 160 and 192 of *yuvconstants into xmm8..xmm14 so that
// YUVTORGB can use registers instead of memory operands.
#define YUVTORGB_SETUP(yuvconstants) \
  "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
  "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
  "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
  "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
  "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
  "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
  "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y
// Input: interleaved UV in xmm0, Y in xmm4. Output: three byte-packed
// channels in xmm0, xmm1 and xmm2. For each channel the pmaddubsw product of
// UV with a coefficient vector (xmm8/9/10) is subtracted from a bias vector
// (xmm11/12/13); the pmulhuw-scaled Y (xmm14) is added with signed
// saturation, and the sum is shifted right by 6 and packed with unsigned
// saturation. xmm3 is used as scratch.
#define YUVTORGB(yuvconstants) \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "movdqa %%xmm11,%%xmm0 \n" \
  "pmaddubsw %%xmm8,%%xmm1 \n" \
  "psubw %%xmm1,%%xmm0 \n" \
  "movdqa %%xmm12,%%xmm1 \n" \
  "pmaddubsw %%xmm9,%%xmm2 \n" \
  "psubw %%xmm2,%%xmm1 \n" \
  "movdqa %%xmm13,%%xmm2 \n" \
  "pmaddubsw %%xmm10,%%xmm3 \n" \
  "psubw %%xmm3,%%xmm2 \n" \
  "pmulhuw %%xmm14,%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm1 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"
// Extra clobbers for the registers loaded by YUVTORGB_SETUP. The trailing
// comma is intentional: the macro is pasted in front of further clobbers.
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
// Non-x86_64: nothing is preloaded; YUVTORGB reads the constants directly
// from *yuvconstants at the same byte offsets.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
// Same computation as the x86_64 variant, with memory operands in place of
// xmm8..xmm14.
#define YUVTORGB(yuvconstants) \
  "movdqa %%xmm0,%%xmm1 \n" \
  "movdqa %%xmm0,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
  "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
  "psubw %%xmm1,%%xmm0 \n" \
  "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
  "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
  "psubw %%xmm2,%%xmm1 \n" \
  "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
  "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
  "psubw %%xmm3,%%xmm2 \n" \
  "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm1 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS
#endif

// Store 8 ARGB values.
// Interleaves the bytes of xmm0, xmm1, xmm2 and xmm5 (in that order within
// each 4-byte pixel) and writes 32 bytes to dst_argb, then advances dst_argb
// by 0x20. xmm5 is read but not modified here; the caller must have set it.
#define STOREARGB \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm5,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
  "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
  "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"

// Store 8 RGBA values.
// Sets xmm5 to all ones, then interleaves the bytes of xmm5, xmm0, xmm1 and
// xmm2 (in that order within each 4-byte pixel) and writes 32 bytes to
// dst_rgba, advancing dst_rgba by 0x20.
#define STORERGBA \
  "pcmpeqb %%xmm5,%%xmm5 \n" \
  "punpcklbw %%xmm2,%%xmm1 \n" \
  "punpcklbw %%xmm0,%%xmm5 \n" \
  "movdqa %%xmm5,%%xmm0 \n" \
  "punpcklwd %%xmm1,%%xmm5 \n" \
  "punpckhwd %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
  "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
  "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"

// Converts a row using READYUV444 -> YUVTORGB -> STOREARGB.
// v_buf is turned into an offset from u_buf before the loop. xmm5 is set to
// all ones once and supplies the fourth byte of every stored pixel.
// width is reduced by 8 per iteration; the loop body runs at least once.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// Converts a row using READYUV422 -> YUVTORGB, then packs the result to
// 3 bytes per pixel: the channels are interleaved into 4-byte groups, the
// two halves are shuffled with kShuffleMaskARGBToRGB24_0 (xmm5) and
// kShuffleMaskARGBToRGB24 (xmm6), merged with palignr, and written as
// 8 + 16 bytes. dst_rgb24 advances by 0x18 and width is reduced by 8 per
// iteration.
// On i386 width is constrained to memory ("+m") and decremented with an
// explicitly sized "subl" -- presumably to relieve register pressure;
// NOTE(review): confirm.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Converts a row using READYUV422 -> YUVTORGB -> STOREARGB.
// v_buf is turned into an offset from u_buf before the loop. xmm5 is set to
// all ones once and supplies the fourth byte of every stored pixel.
// width is reduced by 8 per iteration.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Converts a row using READYUVA422 -> YUVTORGB -> STOREARGB. Unlike the other
// ARGB writers, xmm5 is not preset to all ones: READYUVA422 reloads it from
// a_buf on every iteration, so the fourth byte of each pixel comes from
// a_buf. width is reduced by 8 per iteration; on i386 it is kept in memory
// and decremented with "subl".
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

// Converts a row using READNV12 -> YUVTORGB -> STOREARGB, with xmm5 preset to
// all ones for the fourth byte of each pixel. width is reduced by 8 per
// iteration.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// Converts a row using READNV21 -> YUVTORGB -> STOREARGB, with xmm5 preset to
// all ones for the fourth byte of each pixel. kShuffleNV21 is passed as a
// memory operand for READNV21. width is reduced by 8 per iteration.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// Converts a row using READYUY2 -> YUVTORGB -> STOREARGB, with xmm5 preset to
// all ones for the fourth byte of each pixel. The two YUY2 shuffle masks are
// passed as memory operands. width is reduced by 8 per iteration.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// Converts a row using READUYVY -> YUVTORGB -> STOREARGB, with xmm5 preset to
// all ones for the fourth byte of each pixel. The two UYVY shuffle masks are
// passed as memory operands. width is reduced by 8 per iteration.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// Converts a row using READYUV422 -> YUVTORGB -> STORERGBA.
// The pcmpeqb before the loop is repeated inside STORERGBA on every
// iteration, because STORERGBA overwrites xmm5 while interleaving.
// width is reduced by 8 per iteration.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#endif  // defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 16 UV from 444
// Loads 16 bytes from u_buf and 16 from (u_buf + v_buf), spreads each across
// both 128-bit lanes (vpermq 0xd8) and interleaves them into ymm0; loads 16 Y
// bytes into ymm4 with each byte duplicated. Advances u_buf and y_buf by 16.
#define READYUV444_AVX2 \
  "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV.
// Loads 8 bytes from u_buf and 8 from (u_buf + v_buf), interleaves them,
// and duplicates each 16-bit pair; loads 16 Y bytes into ymm4 with each byte
// duplicated. Advances u_buf by 8 and y_buf by 16.
#define READYUV422_AVX2 \
  "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
// Same as READYUV422_AVX2, and additionally loads 16 bytes from a_buf into
// ymm5 (spread across both lanes with vpermq 0xd8) and advances a_buf by 16.
#define READYUVA422_AVX2 \
  "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
  "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"

// Read 8 UV from NV12, upsample to 16 UV.
// Loads 16 bytes from uv_buf, spreads them across both 128-bit lanes
// (vpermq 0xd8) and duplicates each 16-bit element (vpunpcklwd); loads 16 Y
// bytes into ymm4 with each byte duplicated. Advances uv_buf and y_buf by 16.
#define READNV12_AVX2 \
  "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

// Read 8 VU from NV21, upsample to 16 UV.
// Loads 16 bytes from vu_buf, spreads them across both lanes (vpermq 0xd8)
// and rearranges them with the kShuffleNV21 vpshufb mask (supplied by the
// caller as an asm operand); loads 16 Y bytes into ymm4 with each byte
// duplicated. Advances vu_buf and y_buf by 16.
#define READNV21_AVX2 \
  "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// Loads the same 32 bytes twice: once shuffled with kShuffleYUY2Y into ymm4
// and once shuffled with kShuffleYUY2UV into ymm0. Advances yuy2_buf by 32.
#define READYUY2_AVX2 \
  "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
  "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
  "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Loads the same 32 bytes twice: once shuffled with kShuffleUYVYY into ymm4
// and once shuffled with kShuffleUYVYUV into ymm0. Advances uyvy_buf by 32.
#define READUYVY_AVX2 \
  "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
  "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
  "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"

#if defined(__x86_64__)
// x86_64: preload the seven 32-byte constant vectors found at byte offsets
// 0, 32, 64, 96, 128, 160 and 192 of *yuvconstants into ymm8..ymm14.
// vmovdqa is an aligned load, so *yuvconstants has to be 32-byte aligned.
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
  "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
  "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
  "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
  "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
  "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
  "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
  "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"

// Input: interleaved UV in ymm0, Y in ymm4. Output: three byte-packed
// channels in ymm0, ymm1 and ymm2. For each channel the vpmaddubsw product
// of UV with a coefficient vector (ymm8/9/10) is subtracted from a bias
// vector (ymm11/12/13); the vpmulhuw-scaled Y (ymm14) is added with signed
// saturation, and the sum is shifted right by 6 and packed with unsigned
// saturation.
#define YUVTORGB_AVX2(yuvconstants) \
  "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
  "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
  "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
  "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
  "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
  "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
  "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
  "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"

// Extra clobbers for the registers loaded by YUVTORGB_SETUP_AVX2. The
// trailing comma is intentional: the macro is pasted in front of further
// clobbers.
#define YUVTORGB_REGS_AVX2 \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else  // Convert 16 pixels: 16 UV and 16 Y.
2054 2055#define YUVTORGB_SETUP_AVX2(yuvconstants) 2056#define YUVTORGB_AVX2(yuvconstants) \ 2057 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ 2058 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ 2059 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ 2060 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ 2061 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ 2062 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ 2063 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ 2064 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ 2065 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ 2066 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ 2067 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ 2068 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ 2069 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ 2070 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ 2071 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ 2072 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ 2073 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ 2074 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ 2075 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" 2076#define YUVTORGB_REGS_AVX2 2077#endif 2078 2079// Store 16 ARGB values. 2080#define STOREARGB_AVX2 \ 2081 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ 2082 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ 2083 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ 2084 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ 2085 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ 2086 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ 2087 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ 2088 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ 2089 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" 2090 2091#ifdef HAS_I444TOARGBROW_AVX2 2092// 16 pixels 2093// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
// Converts one row of planar Y/U/V to 4-byte ARGB pixels, 16 pixels per pass.
//   y_buf, u_buf, v_buf - source planes
//   dst_argb            - destination, 64 bytes written per pass
//   yuvconstants        - conversion coefficients read by YUVTORGB_*_AVX2
//   width               - pixel count; reduced by 16 per pass
// The loop body always runs once and repeats while the remaining width is
// positive, so a width that is not a positive multiple of 16 makes the
// routine read and write beyond `width` pixels.
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the distance from u_buf to v_buf.
    // NOTE(review): READYUV444_AVX2 is defined outside this section;
    // presumably it addresses V as u_buf + v_buf - confirm.
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones: the alpha byte stored by STOREARGB_AVX2.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_I444TOARGBROW_AVX2

#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts one row of planar Y/U/V with half-width chroma to 4-byte ARGB
// pixels, 16 pixels per pass.
//   y_buf, u_buf, v_buf - source planes
//   dst_argb            - destination, 64 bytes written per pass
//   yuvconstants        - conversion coefficients read by YUVTORGB_*_AVX2
//   width               - pixel count; reduced by 16 per pass
// The loop body always runs once and repeats while the remaining width is
// positive, so a width that is not a positive multiple of 16 makes the
// routine read and write beyond `width` pixels.
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the distance from u_buf to v_buf.
    // NOTE(review): READYUV422_AVX2 is defined outside this section;
    // presumably it addresses V as u_buf + v_buf - confirm.
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones: the alpha byte stored by STOREARGB_AVX2.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_I422TOARGBROW_AVX2

#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Converts one row of planar Y/U/V plus a separate alpha plane to 4-byte ARGB
// pixels, 16 pixels per pass.
//   y_buf, u_buf, v_buf - source colour planes
//   a_buf               - source alpha plane
//   dst_argb            - destination, 64 bytes written per pass
//   yuvconstants        - conversion coefficients read by YUVTORGB_*_AVX2
//   width               - pixel count; reduced by 16 per pass
// Unlike the other row converters, ymm5 is not preset to all ones here.
// NOTE(review): READYUVA422_AVX2 is defined outside this section; presumably
// it loads the alpha bytes into ymm5 for STOREARGB_AVX2 - confirm.
// The loop body always runs once and repeats while the remaining width is
// positive, so a width that is not a positive multiple of 16 makes the
// routine read and write beyond `width` pixels.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the distance from u_buf to v_buf.
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // Explicit 'l' suffix: on i386 width is a memory operand (see below), so
    // the operand size cannot be inferred from a register name.
    "subl $0x10,%[width] \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [a_buf]"+r"(a_buf), // %[a_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    // i386 forces width into memory; presumably because the five pointer
    // operands plus yuvconstants leave no general register for it.
#if defined(__i386__)
    [width]"+m"(width) // %[width]
#else
    [width]"+rm"(width) // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif // HAS_I422ALPHATOARGBROW_AVX2

#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Converts one row of planar Y/U/V with half-width chroma to 4-byte RGBA
// pixels, 16 pixels per pass.
//   y_buf, u_buf, v_buf - source planes
//   dst_argb            - destination (named dst_argb, but the bytes written
//                         are in RGBA order); 64 bytes written per pass
//   yuvconstants        - conversion coefficients read by YUVTORGB_*_AVX2
//   width               - pixel count; reduced by 16 per pass
// The loop body always runs once and repeats while the remaining width is
// positive, so a width that is not a positive multiple of 16 makes the
// routine read and write beyond `width` pixels.
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the distance from u_buf to v_buf.
    // NOTE(review): READYUV422_AVX2 is defined outside this section;
    // presumably it addresses V as u_buf + v_buf - confirm.
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones: the alpha byte.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    // Same weave as STOREARGB_AVX2 but with alpha first: each pixel is
    // written as the bytes ymm5, ymm0, ymm1, ymm2.
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
    "vpermq $0xd8,%%ymm2,%%ymm2 \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x10,%[width] \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_I422TORGBAROW_AVX2

#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2248void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, 2249 const uint8* uv_buf, 2250 uint8* dst_argb, 2251 const struct YuvConstants* yuvconstants, 2252 int width) { 2253 // clang-format off 2254 asm volatile ( 2255 YUVTORGB_SETUP_AVX2(yuvconstants) 2256 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2257 2258 LABELALIGN 2259 "1: \n" 2260 READNV12_AVX2 2261 YUVTORGB_AVX2(yuvconstants) 2262 STOREARGB_AVX2 2263 "sub $0x10,%[width] \n" 2264 "jg 1b \n" 2265 "vzeroupper \n" 2266 : [y_buf]"+r"(y_buf), // %[y_buf] 2267 [uv_buf]"+r"(uv_buf), // %[uv_buf] 2268 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2269 [width]"+rm"(width) // %[width] 2270 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] 2271 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2272 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2273 ); 2274 // clang-format on 2275} 2276#endif // HAS_NV12TOARGBROW_AVX2 2277 2278#if defined(HAS_NV21TOARGBROW_AVX2) 2279// 16 pixels. 2280// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2281void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, 2282 const uint8* vu_buf, 2283 uint8* dst_argb, 2284 const struct YuvConstants* yuvconstants, 2285 int width) { 2286 // clang-format off 2287 asm volatile ( 2288 YUVTORGB_SETUP_AVX2(yuvconstants) 2289 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2290 2291 LABELALIGN 2292 "1: \n" 2293 READNV21_AVX2 2294 YUVTORGB_AVX2(yuvconstants) 2295 STOREARGB_AVX2 2296 "sub $0x10,%[width] \n" 2297 "jg 1b \n" 2298 "vzeroupper \n" 2299 : [y_buf]"+r"(y_buf), // %[y_buf] 2300 [vu_buf]"+r"(vu_buf), // %[vu_buf] 2301 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2302 [width]"+rm"(width) // %[width] 2303 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2304 [kShuffleNV21]"m"(kShuffleNV21) 2305 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 
2306 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2307 ); 2308 // clang-format on 2309} 2310#endif // HAS_NV21TOARGBROW_AVX2 2311 2312#if defined(HAS_YUY2TOARGBROW_AVX2) 2313// 16 pixels. 2314// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 2315void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, 2316 uint8* dst_argb, 2317 const struct YuvConstants* yuvconstants, 2318 int width) { 2319 // clang-format off 2320 asm volatile ( 2321 YUVTORGB_SETUP_AVX2(yuvconstants) 2322 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2323 2324 LABELALIGN 2325 "1: \n" 2326 READYUY2_AVX2 2327 YUVTORGB_AVX2(yuvconstants) 2328 STOREARGB_AVX2 2329 "sub $0x10,%[width] \n" 2330 "jg 1b \n" 2331 "vzeroupper \n" 2332 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] 2333 [dst_argb]"+r"(dst_argb), // %[dst_argb] 2334 [width]"+rm"(width) // %[width] 2335 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] 2336 [kShuffleYUY2Y]"m"(kShuffleYUY2Y), 2337 [kShuffleYUY2UV]"m"(kShuffleYUY2UV) 2338 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. 2339 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2340 ); 2341 // clang-format on 2342} 2343#endif // HAS_YUY2TOARGBROW_AVX2 2344 2345#if defined(HAS_UYVYTOARGBROW_AVX2) 2346// 16 pixels. 2347// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
// Converts one row of packed UYVY to 4-byte ARGB pixels, 16 pixels per pass.
//   uyvy_buf     - packed source, 32 bytes consumed per pass
//   dst_argb     - destination, 64 bytes written per pass
//   yuvconstants - conversion coefficients read by YUVTORGB_*_AVX2
//   width        - pixel count; reduced by 16 per pass
// The loop body always runs once and repeats while the remaining width is
// positive, so a width that is not a positive multiple of 16 makes the
// routine read and write beyond `width` pixels.
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // ymm5 = all ones: the alpha byte stored by STOREARGB_AVX2.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif // HAS_UYVYTOARGBROW_AVX2

#ifdef HAS_I400TOARGBROW_SSE2
// Expands 8 luma bytes per pass into 8 grey ARGB pixels (32 bytes).
//   y_buf    - source, 8 bytes consumed per pass
//   dst_argb - destination, 32 bytes written per pass
//   width    - pixel count; reduced by 8 per pass
// The scaled luma value is replicated into the three colour bytes and the
// top byte of each pixel is forced to 0xff.  The loop body always runs once
// and repeats while the remaining width is positive.
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    // xmm2 = scale factor, xmm3 = offset, both broadcast to every word.
    "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    // xmm4 = 0xff000000 in every dword: the alpha mask.
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"

    LABELALIGN
    "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    // Duplicate each byte into a 16-bit word before the high multiply.
    "punpcklbw %%xmm0,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    // Unsigned saturating subtract: results below the offset clamp to 0.
    "psubusw %%xmm3,%%xmm0 \n"
    "psrlw $6, %%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "por %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"

    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(y_buf), // %0
    "+r"(dst_argb), // %1
    "+rm"(width) // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif // HAS_I400TOARGBROW_SSE2

#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
//   y_buf    - source, 16 bytes consumed per pass
//   dst_argb - destination, 64 bytes written per pass
//   width    - pixel count; reduced by 16 per pass
// Same arithmetic as the SSE2 version.  The loop body always runs once and
// repeats while the remaining width is positive.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    // ymm2 = scale factor, ymm3 = offset, both broadcast to every word.
    "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
    "vmovd %%eax,%%xmm2 \n"
    "vbroadcastss %%xmm2,%%ymm2 \n"
    "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
    "vmovd %%eax,%%xmm3 \n"
    "vbroadcastss %%xmm3,%%ymm3 \n"
    // ymm4 = 0xff000000 in every dword: the alpha mask.
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpslld $0x18,%%ymm4,%%ymm4 \n"

    LABELALIGN
    "1: \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    // Spread the 16 bytes over both lanes, then duplicate each into a word.
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    // Unsigned saturating subtract: results below the offset clamp to 0.
    "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    // Weave the grey value into all four bytes, then OR in the alpha mask.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
    "vpor %%ymm4,%%ymm0,%%ymm0 \n"
    "vpor %%ymm4,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : "+r"(y_buf), // %0
    "+r"(dst_argb), // %1
    "+rm"(width) // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
// pshufb control that maps output byte i to input byte 15 - i.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

// Copies a row of bytes from src to dst in reverse order, 16 bytes per pass.
//   src   - source row; read from its end towards its start
//   dst   - destination row; written front to back
//   width - byte count; reduced by 16 per pass
// The remaining width doubles as the source index, so each pass reads the 16
// bytes that end at src + width.  The loop body always runs once and repeats
// while the remaining width is positive.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // movdqa: kShuffleMirror must be 16-byte aligned.
    "movdqa %3,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
    "pshufb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(temp_width) // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
// Copies a row of bytes from src to dst in reverse order, 32 bytes per pass.
// Parameters are as for MirrorRow_SSSE3, with width reduced by 32 per pass.
// vpshufb reverses the bytes within each 128-bit lane and vpermq $0x4e then
// swaps the two lanes, which together reverse all 32 bytes.
// NOTE(review): the operand is kShuffleMirror, which this section defines
// only under HAS_MIRRORROW_SSSE3 - confirm both macros are always set together.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Replicate the 16-byte shuffle table into both lanes of ymm5.
    "vbroadcastf128 %3,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpermq $0x4e,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(temp_width) // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// pshufb control: the even input bytes, reversed, go to output bytes 0-7 and
// the odd input bytes, reversed, go to output bytes 8-15.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Reverses a row of interleaved 2-byte pairs and splits it into two planes,
// 8 pairs (16 source bytes) per pass.
//   src   - interleaved source row; read from its end towards its start
//   dst_u - receives the even source bytes, 8 per pass
//   dst_v - receives the odd source bytes, 8 per pass
//   width - pair count; reduced by 8 per pass
// The loop body always runs once and repeats while the remaining width is
// positive.
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // movdqa: kShuffleMirrorUV must be 16-byte aligned.
    "movdqa %4,%%xmm1 \n"
    // Point src at the last 16 bytes of the row.
    // NOTE(review): presumably expands to -0x10(%0,%3,2) - confirm MEMLEA4.
    "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
    // dst_v becomes the distance from dst_u to dst_v, so only dst_u needs
    // to be advanced inside the loop.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    // Low 8 bytes to dst_u, high 8 bytes to dst_v.
    "movlpd %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $8,%3 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(temp_width) // %3
  : "m"(kShuffleMirrorUV) // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSE2

// Copies a row of 4-byte pixels from src to dst in reverse pixel order,
// 4 pixels (16 bytes) per pass.  The bytes inside each pixel keep their order.
//   src   - source row; read from its end towards its start
//   dst   - destination row; written front to back
//   width - pixel count; reduced by 4 per pass
// The loop body always runs once and repeats while the remaining width is
// positive.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Widened to pointer size so it can serve as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Point src at the last 16 bytes of the row.
    // NOTE(review): presumably expands to -0x10(%0,%2,4) - confirm MEMLEA4.
    "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    // 0x1b = 00 01 10 11b: reverses the four dwords.
    "pshufd $0x1b,%%xmm0,%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(temp_width) // %2
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif // HAS_ARGBMIRRORROW_SSE2

#ifdef HAS_ARGBMIRRORROW_AVX2
// Permute table for reversing the order of eight 32-bit elements.
2588static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; 2589void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 2590 intptr_t temp_width = (intptr_t)(width); 2591 asm volatile ( 2592 "vmovdqu %3,%%ymm5 \n" 2593 2594 LABELALIGN 2595 "1: \n" 2596 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 2597 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2598 "lea " MEMLEA(0x20,1) ",%1 \n" 2599 "sub $0x8,%2 \n" 2600 "jg 1b \n" 2601 "vzeroupper \n" 2602 : "+r"(src), // %0 2603 "+r"(dst), // %1 2604 "+r"(temp_width) // %2 2605 : "m"(kARGBShuffleMirror_AVX2) // %3 2606 : "memory", "cc", NACL_R14 2607 "xmm0", "xmm5" 2608 ); 2609} 2610#endif // HAS_ARGBMIRRORROW_AVX2 2611 2612#ifdef HAS_SPLITUVROW_AVX2 2613void SplitUVRow_AVX2(const uint8* src_uv, 2614 uint8* dst_u, 2615 uint8* dst_v, 2616 int width) { 2617 asm volatile ( 2618 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" 2619 "vpsrlw $0x8,%%ymm5,%%ymm5 \n" 2620 "sub %1,%2 \n" 2621 2622 LABELALIGN 2623 "1: \n" 2624 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2625 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 2626 "lea " MEMLEA(0x40,0) ",%0 \n" 2627 "vpsrlw $0x8,%%ymm0,%%ymm2 \n" 2628 "vpsrlw $0x8,%%ymm1,%%ymm3 \n" 2629 "vpand %%ymm5,%%ymm0,%%ymm0 \n" 2630 "vpand %%ymm5,%%ymm1,%%ymm1 \n" 2631 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 2632 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" 2633 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 2634 "vpermq $0xd8,%%ymm2,%%ymm2 \n" 2635 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2636 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) 2637 "lea " MEMLEA(0x20,1) ",%1 \n" 2638 "sub $0x20,%3 \n" 2639 "jg 1b \n" 2640 "vzeroupper \n" 2641 : "+r"(src_uv), // %0 2642 "+r"(dst_u), // %1 2643 "+r"(dst_v), // %2 2644 "+r"(width) // %3 2645 : 2646 : "memory", "cc", NACL_R14 2647 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2648 ); 2649} 2650#endif // HAS_SPLITUVROW_AVX2 2651 2652#ifdef HAS_SPLITUVROW_SSE2 2653void SplitUVRow_SSE2(const uint8* src_uv, 2654 uint8* dst_u, 2655 uint8* dst_v, 2656 int 
width) { 2657 asm volatile ( 2658 "pcmpeqb %%xmm5,%%xmm5 \n" 2659 "psrlw $0x8,%%xmm5 \n" 2660 "sub %1,%2 \n" 2661 2662 LABELALIGN 2663 "1: \n" 2664 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2665 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2666 "lea " MEMLEA(0x20,0) ",%0 \n" 2667 "movdqa %%xmm0,%%xmm2 \n" 2668 "movdqa %%xmm1,%%xmm3 \n" 2669 "pand %%xmm5,%%xmm0 \n" 2670 "pand %%xmm5,%%xmm1 \n" 2671 "packuswb %%xmm1,%%xmm0 \n" 2672 "psrlw $0x8,%%xmm2 \n" 2673 "psrlw $0x8,%%xmm3 \n" 2674 "packuswb %%xmm3,%%xmm2 \n" 2675 "movdqu %%xmm0," MEMACCESS(1) " \n" 2676 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) 2677 "lea " MEMLEA(0x10,1) ",%1 \n" 2678 "sub $0x10,%3 \n" 2679 "jg 1b \n" 2680 : "+r"(src_uv), // %0 2681 "+r"(dst_u), // %1 2682 "+r"(dst_v), // %2 2683 "+r"(width) // %3 2684 : 2685 : "memory", "cc", NACL_R14 2686 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 2687 ); 2688} 2689#endif // HAS_SPLITUVROW_SSE2 2690 2691#ifdef HAS_MERGEUVROW_AVX2 2692void MergeUVRow_AVX2(const uint8* src_u, 2693 const uint8* src_v, 2694 uint8* dst_uv, 2695 int width) { 2696 asm volatile ( 2697 "sub %0,%1 \n" 2698 2699 LABELALIGN 2700 "1: \n" 2701 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2702 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 2703 "lea " MEMLEA(0x20,0) ",%0 \n" 2704 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" 2705 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" 2706 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" 2707 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" 2708 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" 2709 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" 2710 "lea " MEMLEA(0x40,2) ",%2 \n" 2711 "sub $0x20,%3 \n" 2712 "jg 1b \n" 2713 "vzeroupper \n" 2714 : "+r"(src_u), // %0 2715 "+r"(src_v), // %1 2716 "+r"(dst_uv), // %2 2717 "+r"(width) // %3 2718 : 2719 : "memory", "cc", NACL_R14 2720 "xmm0", "xmm1", "xmm2" 2721 ); 2722} 2723#endif // HAS_MERGEUVROW_AVX2 2724 2725#ifdef HAS_MERGEUVROW_SSE2 2726void MergeUVRow_SSE2(const uint8* src_u, 2727 const uint8* 
src_v, 2728 uint8* dst_uv, 2729 int width) { 2730 asm volatile ( 2731 "sub %0,%1 \n" 2732 2733 LABELALIGN 2734 "1: \n" 2735 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2736 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 2737 "lea " MEMLEA(0x10,0) ",%0 \n" 2738 "movdqa %%xmm0,%%xmm2 \n" 2739 "punpcklbw %%xmm1,%%xmm0 \n" 2740 "punpckhbw %%xmm1,%%xmm2 \n" 2741 "movdqu %%xmm0," MEMACCESS(2) " \n" 2742 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" 2743 "lea " MEMLEA(0x20,2) ",%2 \n" 2744 "sub $0x10,%3 \n" 2745 "jg 1b \n" 2746 : "+r"(src_u), // %0 2747 "+r"(src_v), // %1 2748 "+r"(dst_uv), // %2 2749 "+r"(width) // %3 2750 : 2751 : "memory", "cc", NACL_R14 2752 "xmm0", "xmm1", "xmm2" 2753 ); 2754} 2755#endif // HAS_MERGEUVROW_SSE2 2756 2757#ifdef HAS_COPYROW_SSE2 2758void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 2759 asm volatile ( 2760 "test $0xf,%0 \n" 2761 "jne 2f \n" 2762 "test $0xf,%1 \n" 2763 "jne 2f \n" 2764 2765 LABELALIGN 2766 "1: \n" 2767 "movdqa " MEMACCESS(0) ",%%xmm0 \n" 2768 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2769 "lea " MEMLEA(0x20,0) ",%0 \n" 2770 "movdqa %%xmm0," MEMACCESS(1) " \n" 2771 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 2772 "lea " MEMLEA(0x20,1) ",%1 \n" 2773 "sub $0x20,%2 \n" 2774 "jg 1b \n" 2775 "jmp 9f \n" 2776 2777 LABELALIGN 2778 "2: \n" 2779 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 2780 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 2781 "lea " MEMLEA(0x20,0) ",%0 \n" 2782 "movdqu %%xmm0," MEMACCESS(1) " \n" 2783 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 2784 "lea " MEMLEA(0x20,1) ",%1 \n" 2785 "sub $0x20,%2 \n" 2786 "jg 2b \n" 2787 "9: \n" 2788 : "+r"(src), // %0 2789 "+r"(dst), // %1 2790 "+r"(count) // %2 2791 : 2792 : "memory", "cc" 2793 , "xmm0", "xmm1" 2794 ); 2795} 2796#endif // HAS_COPYROW_SSE2 2797 2798#ifdef HAS_COPYROW_AVX 2799void CopyRow_AVX(const uint8* src, uint8* dst, int count) { 2800 asm volatile ( 2801 LABELALIGN 2802 "1: \n" 2803 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 2804 "vmovdqu " MEMACCESS2(0x20,0) 
",%%ymm1 \n" 2805 "lea " MEMLEA(0x40,0) ",%0 \n" 2806 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2807 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" 2808 "lea " MEMLEA(0x40,1) ",%1 \n" 2809 "sub $0x40,%2 \n" 2810 "jg 1b \n" 2811 : "+r"(src), // %0 2812 "+r"(dst), // %1 2813 "+r"(count) // %2 2814 : 2815 : "memory", "cc" 2816 , "xmm0", "xmm1" 2817 ); 2818} 2819#endif // HAS_COPYROW_AVX 2820 2821#ifdef HAS_COPYROW_ERMS 2822// Multiple of 1. 2823void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { 2824 size_t width_tmp = (size_t)(width); 2825 asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n" 2826 : "+S"(src), // %0 2827 "+D"(dst), // %1 2828 "+c"(width_tmp) // %2 2829 : 2830 : "memory", "cc"); 2831} 2832#endif // HAS_COPYROW_ERMS 2833 2834#ifdef HAS_ARGBCOPYALPHAROW_SSE2 2835// width in pixels 2836void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 2837 asm volatile ( 2838 "pcmpeqb %%xmm0,%%xmm0 \n" 2839 "pslld $0x18,%%xmm0 \n" 2840 "pcmpeqb %%xmm1,%%xmm1 \n" 2841 "psrld $0x8,%%xmm1 \n" 2842 2843 LABELALIGN 2844 "1: \n" 2845 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 2846 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" 2847 "lea " MEMLEA(0x20,0) ",%0 \n" 2848 "movdqu " MEMACCESS(1) ",%%xmm4 \n" 2849 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" 2850 "pand %%xmm0,%%xmm2 \n" 2851 "pand %%xmm0,%%xmm3 \n" 2852 "pand %%xmm1,%%xmm4 \n" 2853 "pand %%xmm1,%%xmm5 \n" 2854 "por %%xmm4,%%xmm2 \n" 2855 "por %%xmm5,%%xmm3 \n" 2856 "movdqu %%xmm2," MEMACCESS(1) " \n" 2857 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" 2858 "lea " MEMLEA(0x20,1) ",%1 \n" 2859 "sub $0x8,%2 \n" 2860 "jg 1b \n" 2861 : "+r"(src), // %0 2862 "+r"(dst), // %1 2863 "+r"(width) // %2 2864 : 2865 : "memory", "cc" 2866 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2867 ); 2868} 2869#endif // HAS_ARGBCOPYALPHAROW_SSE2 2870 2871#ifdef HAS_ARGBCOPYALPHAROW_AVX2 2872// width in pixels 2873void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 2874 asm volatile ( 2875 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 
\n" 2876 "vpsrld $0x8,%%ymm0,%%ymm0 \n" 2877 2878 LABELALIGN 2879 "1: \n" 2880 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" 2881 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" 2882 "lea " MEMLEA(0x40,0) ",%0 \n" 2883 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" 2884 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" 2885 "vmovdqu %%ymm1," MEMACCESS(1) " \n" 2886 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" 2887 "lea " MEMLEA(0x40,1) ",%1 \n" 2888 "sub $0x10,%2 \n" 2889 "jg 1b \n" 2890 "vzeroupper \n" 2891 : "+r"(src), // %0 2892 "+r"(dst), // %1 2893 "+r"(width) // %2 2894 : 2895 : "memory", "cc" 2896 , "xmm0", "xmm1", "xmm2" 2897 ); 2898} 2899#endif // HAS_ARGBCOPYALPHAROW_AVX2 2900 2901#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 2902// width in pixels 2903void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { 2904 asm volatile ( 2905 LABELALIGN 2906 "1: \n" 2907 "movdqu " MEMACCESS(0) ", %%xmm0 \n" 2908 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" 2909 "lea " MEMLEA(0x20, 0) ", %0 \n" 2910 "psrld $0x18, %%xmm0 \n" 2911 "psrld $0x18, %%xmm1 \n" 2912 "packssdw %%xmm1, %%xmm0 \n" 2913 "packuswb %%xmm0, %%xmm0 \n" 2914 "movq %%xmm0," MEMACCESS(1) " \n" 2915 "lea " MEMLEA(0x8, 1) ", %1 \n" 2916 "sub $0x8, %2 \n" 2917 "jg 1b \n" 2918 : "+r"(src_argb), // %0 2919 "+r"(dst_a), // %1 2920 "+rm"(width) // %2 2921 : 2922 : "memory", "cc" 2923 , "xmm0", "xmm1" 2924 ); 2925} 2926#endif // HAS_ARGBEXTRACTALPHAROW_SSE2 2927 2928#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 2929static const uvec8 kShuffleAlphaShort_AVX2 = { 2930 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, 2931 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; 2932 2933void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) { 2934 asm volatile ( 2935 "vmovdqa %3,%%ymm4 \n" 2936 "vbroadcastf128 %4,%%ymm5 \n" 2937 2938 LABELALIGN 2939 "1: \n" 2940 "vmovdqu " MEMACCESS(0) ", %%ymm0 \n" 2941 "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n" 2942 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, 
%%ymm0 2943 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 2944 "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n" 2945 "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n" 2946 "lea " MEMLEA(0x80, 0) ", %0 \n" 2947 "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates 2948 "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" 2949 "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" 2950 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates 2951 "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. 2952 "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 2953 "vmovdqu %%ymm0," MEMACCESS(1) " \n" 2954 "lea " MEMLEA(0x20,1) ",%1 \n" 2955 "sub $0x20, %2 \n" 2956 "jg 1b \n" 2957 "vzeroupper \n" 2958 : "+r"(src_argb), // %0 2959 "+r"(dst_a), // %1 2960 "+rm"(width) // %2 2961 : "m"(kPermdARGBToY_AVX), // %3 2962 "m"(kShuffleAlphaShort_AVX2) // %4 2963 : "memory", "cc" 2964 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 2965 ); 2966} 2967#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 2968 2969#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 2970// width in pixels 2971void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 2972 asm volatile ( 2973 "pcmpeqb %%xmm0,%%xmm0 \n" 2974 "pslld $0x18,%%xmm0 \n" 2975 "pcmpeqb %%xmm1,%%xmm1 \n" 2976 "psrld $0x8,%%xmm1 \n" 2977 2978 LABELALIGN 2979 "1: \n" 2980 "movq " MEMACCESS(0) ",%%xmm2 \n" 2981 "lea " MEMLEA(0x8,0) ",%0 \n" 2982 "punpcklbw %%xmm2,%%xmm2 \n" 2983 "punpckhwd %%xmm2,%%xmm3 \n" 2984 "punpcklwd %%xmm2,%%xmm2 \n" 2985 "movdqu " MEMACCESS(1) ",%%xmm4 \n" 2986 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" 2987 "pand %%xmm0,%%xmm2 \n" 2988 "pand %%xmm0,%%xmm3 \n" 2989 "pand %%xmm1,%%xmm4 \n" 2990 "pand %%xmm1,%%xmm5 \n" 2991 "por %%xmm4,%%xmm2 \n" 2992 "por %%xmm5,%%xmm3 \n" 2993 "movdqu %%xmm2," MEMACCESS(1) " \n" 2994 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" 2995 "lea " MEMLEA(0x20,1) ",%1 \n" 2996 "sub $0x8,%2 \n" 2997 "jg 1b \n" 2998 : "+r"(src), // %0 2999 "+r"(dst), // %1 3000 "+r"(width) // %2 3001 : 3002 : "memory", "cc" 3003 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 3004 ); 3005} 3006#endif // 
HAS_ARGBCOPYYTOALPHAROW_SSE2 3007 3008#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 3009// width in pixels 3010void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3011 asm volatile ( 3012 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" 3013 "vpsrld $0x8,%%ymm0,%%ymm0 \n" 3014 3015 LABELALIGN 3016 "1: \n" 3017 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" 3018 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" 3019 "lea " MEMLEA(0x10,0) ",%0 \n" 3020 "vpslld $0x18,%%ymm1,%%ymm1 \n" 3021 "vpslld $0x18,%%ymm2,%%ymm2 \n" 3022 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" 3023 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" 3024 "vmovdqu %%ymm1," MEMACCESS(1) " \n" 3025 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" 3026 "lea " MEMLEA(0x40,1) ",%1 \n" 3027 "sub $0x10,%2 \n" 3028 "jg 1b \n" 3029 "vzeroupper \n" 3030 : "+r"(src), // %0 3031 "+r"(dst), // %1 3032 "+r"(width) // %2 3033 : 3034 : "memory", "cc" 3035 , "xmm0", "xmm1", "xmm2" 3036 ); 3037} 3038#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 3039 3040#ifdef HAS_SETROW_X86 3041void SetRow_X86(uint8* dst, uint8 v8, int width) { 3042 size_t width_tmp = (size_t)(width >> 2); 3043 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
3044 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" 3045 : "+D"(dst), // %0 3046 "+c"(width_tmp) // %1 3047 : "a"(v32) // %2 3048 : "memory", "cc"); 3049} 3050 3051void SetRow_ERMS(uint8* dst, uint8 v8, int width) { 3052 size_t width_tmp = (size_t)(width); 3053 asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n" 3054 : "+D"(dst), // %0 3055 "+c"(width_tmp) // %1 3056 : "a"(v8) // %2 3057 : "memory", "cc"); 3058} 3059 3060void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { 3061 size_t width_tmp = (size_t)(width); 3062 asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n" 3063 : "+D"(dst_argb), // %0 3064 "+c"(width_tmp) // %1 3065 : "a"(v32) // %2 3066 : "memory", "cc"); 3067} 3068#endif // HAS_SETROW_X86 3069 3070#ifdef HAS_YUY2TOYROW_SSE2 3071void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { 3072 asm volatile ( 3073 "pcmpeqb %%xmm5,%%xmm5 \n" 3074 "psrlw $0x8,%%xmm5 \n" 3075 3076 LABELALIGN 3077 "1: \n" 3078 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3079 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3080 "lea " MEMLEA(0x20,0) ",%0 \n" 3081 "pand %%xmm5,%%xmm0 \n" 3082 "pand %%xmm5,%%xmm1 \n" 3083 "packuswb %%xmm1,%%xmm0 \n" 3084 "movdqu %%xmm0," MEMACCESS(1) " \n" 3085 "lea " MEMLEA(0x10,1) ",%1 \n" 3086 "sub $0x10,%2 \n" 3087 "jg 1b \n" 3088 : "+r"(src_yuy2), // %0 3089 "+r"(dst_y), // %1 3090 "+r"(width) // %2 3091 : 3092 : "memory", "cc" 3093 , "xmm0", "xmm1", "xmm5" 3094 ); 3095} 3096 3097void YUY2ToUVRow_SSE2(const uint8* src_yuy2, 3098 int stride_yuy2, 3099 uint8* dst_u, 3100 uint8* dst_v, 3101 int width) { 3102 asm volatile ( 3103 "pcmpeqb %%xmm5,%%xmm5 \n" 3104 "psrlw $0x8,%%xmm5 \n" 3105 "sub %1,%2 \n" 3106 3107 LABELALIGN 3108 "1: \n" 3109 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3110 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3111 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 3112 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 3113 "lea " MEMLEA(0x20,0) ",%0 \n" 3114 "pavgb %%xmm2,%%xmm0 \n" 3115 
"pavgb %%xmm3,%%xmm1 \n" 3116 "psrlw $0x8,%%xmm0 \n" 3117 "psrlw $0x8,%%xmm1 \n" 3118 "packuswb %%xmm1,%%xmm0 \n" 3119 "movdqa %%xmm0,%%xmm1 \n" 3120 "pand %%xmm5,%%xmm0 \n" 3121 "packuswb %%xmm0,%%xmm0 \n" 3122 "psrlw $0x8,%%xmm1 \n" 3123 "packuswb %%xmm1,%%xmm1 \n" 3124 "movq %%xmm0," MEMACCESS(1) " \n" 3125 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3126 "lea " MEMLEA(0x8,1) ",%1 \n" 3127 "sub $0x10,%3 \n" 3128 "jg 1b \n" 3129 : "+r"(src_yuy2), // %0 3130 "+r"(dst_u), // %1 3131 "+r"(dst_v), // %2 3132 "+r"(width) // %3 3133 : "r"((intptr_t)(stride_yuy2)) // %4 3134 : "memory", "cc", NACL_R14 3135 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 3136 ); 3137} 3138 3139void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3140 uint8* dst_u, 3141 uint8* dst_v, 3142 int width) { 3143 asm volatile ( 3144 "pcmpeqb %%xmm5,%%xmm5 \n" 3145 "psrlw $0x8,%%xmm5 \n" 3146 "sub %1,%2 \n" 3147 3148 LABELALIGN 3149 "1: \n" 3150 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3151 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3152 "lea " MEMLEA(0x20,0) ",%0 \n" 3153 "psrlw $0x8,%%xmm0 \n" 3154 "psrlw $0x8,%%xmm1 \n" 3155 "packuswb %%xmm1,%%xmm0 \n" 3156 "movdqa %%xmm0,%%xmm1 \n" 3157 "pand %%xmm5,%%xmm0 \n" 3158 "packuswb %%xmm0,%%xmm0 \n" 3159 "psrlw $0x8,%%xmm1 \n" 3160 "packuswb %%xmm1,%%xmm1 \n" 3161 "movq %%xmm0," MEMACCESS(1) " \n" 3162 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) 3163 "lea " MEMLEA(0x8,1) ",%1 \n" 3164 "sub $0x10,%3 \n" 3165 "jg 1b \n" 3166 : "+r"(src_yuy2), // %0 3167 "+r"(dst_u), // %1 3168 "+r"(dst_v), // %2 3169 "+r"(width) // %3 3170 : 3171 : "memory", "cc", NACL_R14 3172 "xmm0", "xmm1", "xmm5" 3173 ); 3174} 3175 3176void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { 3177 asm volatile ( 3178 LABELALIGN 3179 "1: \n" 3180 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 3181 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 3182 "lea " MEMLEA(0x20,0) ",%0 \n" 3183 "psrlw $0x8,%%xmm0 \n" 3184 "psrlw $0x8,%%xmm1 \n" 3185 "packuswb %%xmm1,%%xmm0 \n" 3186 "movdqu 
%%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}

// Averages two source rows (src_uyvy and src_uyvy + stride_uyvy) and splits
// the even-indexed bytes of the result into two planes: bytes 0, 4, 8, ... go
// to dst_u and bytes 2, 6, 10, ... go to dst_v.
// Each iteration reads 32 bytes from each row, writes 8 bytes to each
// destination and subtracts 16 from width.
void UYVYToUVRow_SSE2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    // xmm5 = 0x00ff in every 16-bit word.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u so one pointer advances both.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)  // movdqu (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)  // movdqu 0x10(%0,%4,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Rounded byte-wise average of the two rows.
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    // Keep the low (even-indexed) byte of every word, then narrow to bytes.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    // Split the interleaved result into its even and odd positions.
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}

// Single-row variant of the U/V split: takes the even-indexed bytes of
// src_uyvy and writes bytes 0, 4, 8, ... to dst_u and bytes 2, 6, 10, ... to
// dst_v, with no vertical averaging.
// Each iteration reads 32 source bytes, writes 8 bytes to each destination
// and subtracts 16 from width.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    // xmm5 = 0x00ff in every 16-bit word.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"
    // Turn dst_v into an offset from dst_u.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    // Keep the even-indexed source bytes.
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm5,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)  // movq %%xmm1,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_YUY2TOYROW_AVX2
// 256-bit version of the even-byte extraction: copies the even-indexed bytes
// of src_yuy2 to dst_y.
// Each iteration reads 64 source bytes, writes 32 bytes and subtracts 32 from
// width; the loop repeats while the remaining width is > 0.
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    // vpackuswb packs within each 128-bit lane; vpermq $0xd8 (quadword order
    // 0,2,1,3) puts the bytes back into source order.
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}

// 256-bit version of the two-row U/V split: averages src_yuy2 with the row at
// src_yuy2 + stride_yuy2, keeps the odd-indexed bytes and de-interleaves them
// into dst_u and dst_v.
// Each iteration reads 64 bytes from each row, writes 16 bytes to each
// destination and subtracts 32 from width.
void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
    // Turn dst_v into an offset from dst_u so one pointer advances both.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)  // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Keep the odd-indexed bytes and restore order after the in-lane pack.
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm0,%%ymm1 
\n"
    // ymm1 = even positions (written to dst_u), ymm0 = odd positions (written
    // to dst_v); each is packed to 16 bytes in the low 128 bits.
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1)  // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}

// 256-bit single-row U/V split: keeps the odd-indexed bytes of src_yuy2 and
// de-interleaves them into dst_u (first of each pair) and dst_v (second of
// each pair), with no vertical averaging.
// Each iteration reads 64 source bytes, writes 16 bytes to each destination
// and subtracts 32 from width.
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
    // Turn dst_v into an offset from dst_u.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Keep the odd-indexed bytes; vpermq $0xd8 undoes the lane interleave
    // introduced by the in-lane vpackuswb.
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    // ymm1 = even positions (dst_u), ymm0 = odd positions (dst_v).
    "vpand %%ymm5,%%ymm0,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1)  // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}

// 256-bit version of the odd-byte extraction: copies the odd-indexed bytes of
// src_uyvy to dst_y.
// Each iteration reads 64 source bytes, writes 32 bytes and subtracts 32 from
// width.
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Shift the high byte of every word down, narrow to bytes, then restore
    // source order after the in-lane pack.
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
// 256-bit version of the two-row U/V split: averages src_uyvy with the row at
// src_uyvy + stride_uyvy, keeps the even-indexed bytes and de-interleaves
// them into dst_u and dst_v.
// Each iteration reads 64 bytes from each row, writes 16 bytes to each
// destination and subtracts 32 from width.
void UYVYToUVRow_AVX2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
    // Turn dst_v into an offset from dst_u so one pointer advances both.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)  // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Keep the even-indexed bytes and restore order after the in-lane pack.
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    // ymm1 = even positions (dst_u), ymm0 = odd positions (dst_v).
    "vpand %%ymm5,%%ymm0,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1)  // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}

// 256-bit single-row U/V split: keeps the even-indexed bytes of src_uyvy and
// de-interleaves them into dst_u and dst_v, with no vertical averaging.
// Each iteration reads 64 source bytes, writes 16 bytes to each destination
// and subtracts 32 from width.
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm 
volatile (
    // ymm5 = 0x00ff in every 16-bit word.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
    // Turn dst_v into an offset from dst_u.
    "sub %1,%2 \n"

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    // Keep the even-indexed bytes and restore order after the in-lane pack.
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    // ymm1 = even positions (dst_u), ymm0 = odd positions (dst_v).
    "vpand %%ymm5,%%ymm0,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1)  // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
// Copies byte 3 of each 4-byte pixel into the low byte of both 16-bit words
// of that pixel; 0x80 entries make pshufb write zero.
static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
                              11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, then 1 pixel at a time for the remainder.
// Per channel the loops compute
//   dst = src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8)
// with a saturating byte add, where alpha0 is byte 3 of the src_argb0 pixel.
// Byte 3 of every output pixel is forced to 0xff.
void ARGBBlendRow_SSSE3(const uint8* src_argb0,
                        const uint8* src_argb1,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    // xmm7 = 0x0001 in every word (turns 255 - a into 256 - a).
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    // xmm6 = 0x00ff in every word (selects bytes 0 and 2 of each pixel).
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    // xmm5 = 0xff00 in every word (selects bytes 1 and 3 of each pixel).
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    // xmm4 = 0xff000000 in every dword (byte 3 of each pixel).
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    // Skip the 4-pixel loop when fewer than 4 pixels remain.
    "sub $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    // Invert byte 3 of each pixel (a -> 255 - a) and spread it to both words.
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    // xmm3 = 256 - a in every word of the pixel.
    "paddw %%xmm7,%%xmm3 \n"
    // Bytes 0 and 2 of src_argb1 scaled by (256 - a).
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    // Force byte 3 of the src_argb0 copy to 0xff.
    "por %%xmm4,%%xmm0 \n"
    // Bytes 1 and 3 of src_argb1 scaled by (256 - a); the result stays in
    // the high byte of each word.
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    // width is now (remaining - 4); adding 3 gives (remaining - 1), which is
    // negative when nothing is left.
    "add $0x3,%3 \n"
    "jl 99f \n"

    // 1 pixel loop.
    "91: \n"
    "movd " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 91b \n"
    "99: \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// In the code A2 is a byte of src0, B2 the matching byte of src1 and C2 the
// matching byte of alpha; the signed form is what pmaddubsw evaluates.
// Each iteration handles 8 bytes and subtracts 8 from width; the loop repeats
// while the remaining width is > 0.
void BlendPlaneRow_SSSE3(const uint8* src0,
                         const uint8* src1,
                         const uint8* alpha,
                         uint8* dst,
                         int width) {
  asm volatile(
      // xmm5 = 0xff00 in every word: xor turns (a, a) byte pairs into
      // (a, 255 - a).
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"
      // xmm6 = 0x80 in every byte: bias that makes the source bytes signed.
      "mov $0x80808080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"
      // xmm7 = 0x807f in every word: the 32768 + 127 term of the formula.
      "mov $0x807f807f,%%eax \n"
      "movd %%eax,%%xmm7 \n"
      "pshufd $0x0,%%xmm7,%%xmm7 \n"
      // Rebase src0, src1 and dst on alpha so %2 is the only moving pointer.
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%2),%%xmm0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "pxor %%xmm5,%%xmm0 \n"
      "movq (%0,%2,1),%%xmm1 \n"
      "movq (%1,%2,1),%%xmm2 \n"
      // Interleave src0/src1 bytes so each word holds one (src0, src1) pair.
      "punpcklbw %%xmm2,%%xmm1 \n"
      "psubb %%xmm6,%%xmm1 \n"
      // Unsigned weights in xmm0 times signed sources in xmm1, pairs summed.
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "paddw %%xmm7,%%xmm0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%3,%2,1) \n"
      "lea 0x8(%2),%2 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
      ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// In the code A2 is a byte of src0, B2 the matching byte of src1 and C2 the
// matching byte of alpha; the signed form is what vpmaddubsw evaluates.
// Each iteration handles 32 bytes and subtracts 32 from width; the loop
// repeats while the remaining width is > 0.
void BlendPlaneRow_AVX2(const uint8* src0,
                        const uint8* src1,
                        const uint8* alpha,
                        uint8* dst,
                        int width) {
  asm volatile(
      // ymm5 = 0xff00 in every word: xor turns (a, a) byte pairs into
      // (a, 255 - a).
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsllw $0x8,%%ymm5,%%ymm5 \n"
      // ymm6 = 0x80 in every byte: bias that makes the source bytes signed.
      "mov $0x80808080,%%eax \n"
      "vmovd %%eax,%%xmm6 \n"
      "vbroadcastss %%xmm6,%%ymm6 \n"
      // ymm7 = 0x807f in every word: the 32768 + 127 term of the formula.
      "mov $0x807f807f,%%eax \n"
      "vmovd %%eax,%%xmm7 \n"
      "vbroadcastss %%xmm7,%%ymm7 \n"
      // Rebase src0, src1 and dst on alpha so %2 is the only moving pointer.
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 32 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%2),%%ymm0 \n"
      // High and low halves of each lane are processed separately (ymm3/ymm4
      // for the high half, ymm0/ymm1 for the low half) and re-packed at the
      // end, so the in-lane unpack and pack cancel out.
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
      "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
      "vmovdqu (%0,%2,1),%%ymm1 \n"
      "vmovdqu (%1,%2,1),%%ymm2 \n"
      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
      "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
      "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%3,%2,1) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
      ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
// For the first two pixels: byte 3 (resp. 7) is copied into both bytes of the
// three color words; the 128u entries zero the fourth (alpha) word.
static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                               7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static uvec8 
kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                               15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
// Each of the three color bytes of a pixel is multiplied by the pixel's own
// byte 3 (alpha): both are widened to 16 bits by byte duplication (x * 0x101),
// multiplied with pmulhuw and shifted right by 8. Byte 3 itself is copied
// through unchanged.
// Each iteration reads and writes 16 bytes and subtracts 4 from width.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm3 = 0xff000000 in every dword (mask for byte 3 of each pixel).
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "pslld $0x18,%%xmm3 \n"
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    // Pixels 0-1: xmm0 = duplicated alpha, xmm1 = duplicated channel bytes.
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "punpcklbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm1,%%xmm0 \n"
    // Pixels 2-3: same computation on the high half.
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "punpckhbw %%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    // xmm2 = original byte 3 of every pixel, OR-ed back in after packing.
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "pand %%xmm3,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Applied to pixels already widened to four 16-bit words: copies word 3
// (bytes 6-7, resp. 14-15) into the three color words and zeroes word 3.
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
                                         128u, 128u, 14u, 15u, 14u, 15u,
                                         14u, 15u, 128u, 128u};
// Attenuate 8 pixels at a time.
// Same per-channel computation as the SSSE3 version, on 32 bytes per
// iteration; width is decremented by 8 each time.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    // ymm5 = 0xff000000 in every dword (mask for byte 3 of each pixel).
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpslld $0x18,%%ymm5,%%ymm5 \n"
    // Turn dst_argb into an offset from src_argb.
    "sub %0,%1 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
    // Widen every byte to a word by duplicating it (x * 0x101).
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
    // ymm2/ymm3 = widened alpha in the three color lanes, zero in lane 3.
    "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
    "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    // Keep only byte 3 of the original pixels to OR back in after packing.
    "vpand %%ymm5,%%ymm6,%%ymm6 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpor %%ymm6,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)  // vmovdqu %%ymm0,(%0,%1)
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleAlpha_AVX2)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// For every pixel, byte 3 (alpha) indexes the 32-bit entries of
// fixed_invtbl8 (defined elsewhere in the project; presumably a reciprocal
// table, to be confirmed against its definition). The low 16 bits of the
// entry multiply the three color channels and the high 16 bits multiply
// byte 3, using pmulhuw on channel bytes widened by duplication.
// Each iteration reads and writes 16 bytes and subtracts 4 from width.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register holding the current alpha byte (%3)
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    // Alpha of pixel 0 -> %3, then load table entry %4[%3] into xmm2.
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)  // movd 0x0(%4,%3,4),%%xmm2
    // Alpha of pixel 1.
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)  // movd 0x0(%4,%3,4),%%xmm3
    // pshuflw $0x40: words become (w0, w0, w0, w1) of the table entry.
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    // Same sequence for pixels 2 and 3 (alpha at byte offsets 0x0b and 0x0f).
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)  // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)  // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    // Saturating pack back to bytes.
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width),     // %2
    "=&r"(alpha)     // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Within each 8-byte group it selects words (0, 0, 0, 3).
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
// Same table-driven multiply as the SSE2 version: the eight fixed_invtbl8
// entries are fetched with scalar loads and assembled into one ymm register.
// Each iteration reads and writes 32 bytes and subtracts 8 from width.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register holding the current alpha byte (%3)
  asm volatile (
    // Turn dst_argb into an offset from src_argb.
    "sub %0,%1 \n"
    "vbroadcastf128 %5,%%ymm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    // replace VPGATHER
    // Eight scalar lookups: each movzb reads byte 3 of one pixel (offsets
    // 0x03, 0x07, ... 0x1f) into %3 and the following vmovd loads the 32-bit
    // table entry at %4 + %3 * 4.
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)  // vmovd 0x0(%4,%3,4),%%xmm0
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)  // vmovd 0x0(%4,%3,4),%%xmm1
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)  // vmovd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)  // vmovd 0x0(%4,%3,4),%%xmm3
    "movzb " MEMACCESS2(0x13,0) ",%3 \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)  // vmovd 0x0(%4,%3,4),%%xmm0
    "movzb " MEMACCESS2(0x17,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)  // vmovd 0x0(%4,%3,4),%%xmm1
    "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)  // vmovd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)  // vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
    // Combine: entries for pixels 0-3 in the low lane, 4-7 in the high lane.
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
    // end of VPGATHER

    "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
    // Widen every pixel byte to a word by duplicating it (x * 0x101).
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
    // Duplicate the table words, then pick (low, low, low, high) per pixel.
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
    "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
    "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)  // vmovdqu %%ymm0,(%0,%1)
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width),     // %2
    "=&r"(alpha)     // %3
  : "r"(fixed_invtbl8),  // %4
    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// For each pixel a single value y = (weighted sum of bytes 0-2 using
// kARGBToYJ + kAddYJ64) >> 7 is computed and written to bytes 0, 1 and 2;
// byte 3 is copied from the source pixel.
// Each iteration reads and writes 32 bytes and subtracts 8 from width.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    // Weighted sum per pixel: pmaddubsw gives two partial sums per pixel,
    // phaddw adds them.
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // Extract byte 3 of each of the 8 pixels into xmm2.
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrld $0x18,%%xmm2 \n"
    "psrld $0x18,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    // Rebuild pixels as (y, y, y, byte3).
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm3 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                             17, 68, 35, 0, 17, 68, 35, 0};

static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                             22, 
88, 45, 0, 22, 88, 45, 0};

static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                             24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb. For each pixel, bytes 0, 1 and 2 are replaced
// by (weighted sum of the original bytes 0-2) >> 7 using kARGBToSepiaB,
// kARGBToSepiaG and kARGBToSepiaR respectively; byte 3 is kept.
// Each iteration rewrites 32 bytes and subtracts 8 from width.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %2,%%xmm2 \n"
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    // New byte 0 of each pixel -> xmm0.
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "phaddw %%xmm6,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    // New byte 1 -> xmm5, interleaved with byte 0 into xmm0.
    "movdqu " MEMACCESS(0) ",%%xmm5 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    // New byte 2 -> xmm5.
    "movdqu " MEMACCESS(0) ",%%xmm5 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    // Original byte 3 -> xmm6, interleaved with byte 2 into xmm5.
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    // Merge the (b0, b1) and (b2, b3) pairs back into 4-byte pixels.
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%1 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef 
HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb points at 16 signed bytes: four groups of four coefficients.
// Output byte k of each pixel is (sum over the four source bytes of
// source_byte * matrix_argb[4 * k + i]) >> 6, computed with saturating 16-bit
// adds and an arithmetic shift, then clamped to 0..255 by packuswb.
// Each iteration reads and writes 32 bytes and subtracts 8 from width.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
                              uint8* dst_argb,
                              const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Broadcast each 4-coefficient group: xmm2..xmm5 = groups 0..3.
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    "pshufd $0x00,%%xmm5,%%xmm2 \n"
    "pshufd $0x55,%%xmm5,%%xmm3 \n"
    "pshufd $0xaa,%%xmm5,%%xmm4 \n"
    "pshufd $0xff,%%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    // Output bytes 0 (xmm0) and 1 (xmm6), interleaved into xmm0.
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm7 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddsw %%xmm7,%%xmm0 \n"
    "phaddsw %%xmm1,%%xmm6 \n"
    "psraw $0x6,%%xmm0 \n"
    "psraw $0x6,%%xmm6 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm0 \n"
    // Output bytes 2 (xmm1) and 3 (xmm6), interleaved into xmm1.
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm7 \n"
    "phaddsw %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm7 \n"
    "phaddsw %%xmm7,%%xmm6 \n"
    "psraw $0x6,%%xmm1 \n"
    "psraw $0x6,%%xmm6 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm1 \n"
    // Merge the (b0, b1) and (b2, b3) pairs back into 4-byte pixels.
    "movdqa %%xmm0,%%xmm6 \n"
    "punpcklwd %%xmm1,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm6 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(matrix_argb)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Works in place on dst_argb. For bytes 0-2 of each pixel:
//   v = ((v * scale) >> 16) * interval_size + interval_offset
// using the low 16 bits of each parameter, with the result clamped to 0..255
// by packuswb. The original byte 3 is OR-ed into the result.
// Each iteration rewrites 16 bytes and subtracts 4 from width.
void ARGBQuantizeRow_SSE2(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    "movd %2,%%xmm2 \n"
    "movd %3,%%xmm3 \n"
    "movd %4,%%xmm4 \n"
    // pshuflw $0x40 + pshufd $0x44: word lanes 0-2 of every pixel get the low
    // 16 bits of the parameter, lane 3 gets its upper 16 bits.
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshufd $0x44,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "pshufd $0x44,%%xmm3,%%xmm3 \n"
    "pshuflw $0x40,%%xmm4,%%xmm4 \n"
    "pshufd $0x44,%%xmm4,%%xmm4 \n"
    // xmm5 = 0 (for zero-extending bytes), xmm6 = 0xff000000 per dword.
    "pxor %%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "pslld $0x18,%%xmm6 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "pmullw %%xmm3,%%xmm0 \n"
    // xmm7 = original byte 3 of every pixel.
    "movdqu " MEMACCESS(0) ",%%xmm7 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "pand %%xmm6,%%xmm7 \n"
    "paddw %%xmm4,%%xmm0 \n"
    "paddw %%xmm4,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x4,%1 \n"
    "jg 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  : "r"(scale),            // %2
    "r"(interval_size),    // %3
    "r"(interval_offset)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Multiplies every byte of each 4-byte pixel by the corresponding byte of
// `value`. Both operands are widened to 16 bits by byte duplication
// (x * 0x101), multiplied with pmulhuw (high 16 bits of the product) and the
// result is shifted right by 8 before packing back to bytes.
// Each iteration reads and writes 16 bytes and subtracts 4 from width.
void ARGBShadeRow_SSE2(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    // xmm2 = the four bytes of value, each duplicated into a word, repeated
    // for both pixels of a half register.
    "movd %3,%%xmm2 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm2 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Bytes of src_argb0 are widened by duplication (x * 0x101), bytes of
// src_argb1 are zero-extended, and pmulhuw keeps the high 16 bits of each
// product, which are then packed back to bytes.
// Each iteration reads 16 bytes from each source, writes 16 bytes and
// subtracts 4 from width.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // xmm5 = 0, used to zero-extend the src_argb1 bytes.
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
4194 LABELALIGN 4195 "1: \n" 4196 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4197 "lea " MEMLEA(0x10,0) ",%0 \n" 4198 "movdqu " MEMACCESS(1) ",%%xmm2 \n" 4199 "lea " MEMLEA(0x10,1) ",%1 \n" 4200 "movdqu %%xmm0,%%xmm1 \n" 4201 "movdqu %%xmm2,%%xmm3 \n" 4202 "punpcklbw %%xmm0,%%xmm0 \n" 4203 "punpckhbw %%xmm1,%%xmm1 \n" 4204 "punpcklbw %%xmm5,%%xmm2 \n" 4205 "punpckhbw %%xmm5,%%xmm3 \n" 4206 "pmulhuw %%xmm2,%%xmm0 \n" 4207 "pmulhuw %%xmm3,%%xmm1 \n" 4208 "packuswb %%xmm1,%%xmm0 \n" 4209 "movdqu %%xmm0," MEMACCESS(2) " \n" 4210 "lea " MEMLEA(0x10,2) ",%2 \n" 4211 "sub $0x4,%3 \n" 4212 "jg 1b \n" 4213 : "+r"(src_argb0), // %0 4214 "+r"(src_argb1), // %1 4215 "+r"(dst_argb), // %2 4216 "+r"(width) // %3 4217 : 4218 : "memory", "cc" 4219 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4220 ); 4221} 4222#endif // HAS_ARGBMULTIPLYROW_SSE2 4223 4224#ifdef HAS_ARGBMULTIPLYROW_AVX2 4225// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 4226void ARGBMultiplyRow_AVX2(const uint8* src_argb0, 4227 const uint8* src_argb1, 4228 uint8* dst_argb, 4229 int width) { 4230 asm volatile ( 4231 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" 4232 4233 // 4 pixel loop. 
4234 LABELALIGN 4235 "1: \n" 4236 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" 4237 "lea " MEMLEA(0x20,0) ",%0 \n" 4238 "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" 4239 "lea " MEMLEA(0x20,1) ",%1 \n" 4240 "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" 4241 "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" 4242 "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" 4243 "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" 4244 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" 4245 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" 4246 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" 4247 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4248 "lea " MEMLEA(0x20,2) ",%2 \n" 4249 "sub $0x8,%3 \n" 4250 "jg 1b \n" 4251 "vzeroupper \n" 4252 : "+r"(src_argb0), // %0 4253 "+r"(src_argb1), // %1 4254 "+r"(dst_argb), // %2 4255 "+r"(width) // %3 4256 : 4257 : "memory", "cc" 4258#if defined(__AVX2__) 4259 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4260#endif 4261 ); 4262} 4263#endif // HAS_ARGBMULTIPLYROW_AVX2 4264 4265#ifdef HAS_ARGBADDROW_SSE2 4266// Add 2 rows of ARGB pixels together, 4 pixels at a time. 4267void ARGBAddRow_SSE2(const uint8* src_argb0, 4268 const uint8* src_argb1, 4269 uint8* dst_argb, 4270 int width) { 4271 asm volatile ( 4272 // 4 pixel loop. 4273 LABELALIGN 4274 "1: \n" 4275 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4276 "lea " MEMLEA(0x10,0) ",%0 \n" 4277 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4278 "lea " MEMLEA(0x10,1) ",%1 \n" 4279 "paddusb %%xmm1,%%xmm0 \n" 4280 "movdqu %%xmm0," MEMACCESS(2) " \n" 4281 "lea " MEMLEA(0x10,2) ",%2 \n" 4282 "sub $0x4,%3 \n" 4283 "jg 1b \n" 4284 : "+r"(src_argb0), // %0 4285 "+r"(src_argb1), // %1 4286 "+r"(dst_argb), // %2 4287 "+r"(width) // %3 4288 : 4289 : "memory", "cc" 4290 , "xmm0", "xmm1" 4291 ); 4292} 4293#endif // HAS_ARGBADDROW_SSE2 4294 4295#ifdef HAS_ARGBADDROW_AVX2 4296// Add 2 rows of ARGB pixels together, 4 pixels at a time. 4297void ARGBAddRow_AVX2(const uint8* src_argb0, 4298 const uint8* src_argb1, 4299 uint8* dst_argb, 4300 int width) { 4301 asm volatile ( 4302 // 4 pixel loop. 
4303 LABELALIGN 4304 "1: \n" 4305 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4306 "lea " MEMLEA(0x20,0) ",%0 \n" 4307 "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" 4308 "lea " MEMLEA(0x20,1) ",%1 \n" 4309 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4310 "lea " MEMLEA(0x20,2) ",%2 \n" 4311 "sub $0x8,%3 \n" 4312 "jg 1b \n" 4313 "vzeroupper \n" 4314 : "+r"(src_argb0), // %0 4315 "+r"(src_argb1), // %1 4316 "+r"(dst_argb), // %2 4317 "+r"(width) // %3 4318 : 4319 : "memory", "cc" 4320 , "xmm0" 4321 ); 4322} 4323#endif // HAS_ARGBADDROW_AVX2 4324 4325#ifdef HAS_ARGBSUBTRACTROW_SSE2 4326// Subtract 2 rows of ARGB pixels, 4 pixels at a time. 4327void ARGBSubtractRow_SSE2(const uint8* src_argb0, 4328 const uint8* src_argb1, 4329 uint8* dst_argb, 4330 int width) { 4331 asm volatile ( 4332 // 4 pixel loop. 4333 LABELALIGN 4334 "1: \n" 4335 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4336 "lea " MEMLEA(0x10,0) ",%0 \n" 4337 "movdqu " MEMACCESS(1) ",%%xmm1 \n" 4338 "lea " MEMLEA(0x10,1) ",%1 \n" 4339 "psubusb %%xmm1,%%xmm0 \n" 4340 "movdqu %%xmm0," MEMACCESS(2) " \n" 4341 "lea " MEMLEA(0x10,2) ",%2 \n" 4342 "sub $0x4,%3 \n" 4343 "jg 1b \n" 4344 : "+r"(src_argb0), // %0 4345 "+r"(src_argb1), // %1 4346 "+r"(dst_argb), // %2 4347 "+r"(width) // %3 4348 : 4349 : "memory", "cc" 4350 , "xmm0", "xmm1" 4351 ); 4352} 4353#endif // HAS_ARGBSUBTRACTROW_SSE2 4354 4355#ifdef HAS_ARGBSUBTRACTROW_AVX2 4356// Subtract 2 rows of ARGB pixels, 8 pixels at a time. 4357void ARGBSubtractRow_AVX2(const uint8* src_argb0, 4358 const uint8* src_argb1, 4359 uint8* dst_argb, 4360 int width) { 4361 asm volatile ( 4362 // 4 pixel loop. 
4363 LABELALIGN 4364 "1: \n" 4365 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 4366 "lea " MEMLEA(0x20,0) ",%0 \n" 4367 "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" 4368 "lea " MEMLEA(0x20,1) ",%1 \n" 4369 "vmovdqu %%ymm0," MEMACCESS(2) " \n" 4370 "lea " MEMLEA(0x20,2) ",%2 \n" 4371 "sub $0x8,%3 \n" 4372 "jg 1b \n" 4373 "vzeroupper \n" 4374 : "+r"(src_argb0), // %0 4375 "+r"(src_argb1), // %1 4376 "+r"(dst_argb), // %2 4377 "+r"(width) // %3 4378 : 4379 : "memory", "cc" 4380 , "xmm0" 4381 ); 4382} 4383#endif // HAS_ARGBSUBTRACTROW_AVX2 4384 4385#ifdef HAS_SOBELXROW_SSE2 4386// SobelX as a matrix is 4387// -1 0 1 4388// -2 0 2 4389// -1 0 1 4390void SobelXRow_SSE2(const uint8* src_y0, 4391 const uint8* src_y1, 4392 const uint8* src_y2, 4393 uint8* dst_sobelx, 4394 int width) { 4395 asm volatile ( 4396 "sub %0,%1 \n" 4397 "sub %0,%2 \n" 4398 "sub %0,%3 \n" 4399 "pxor %%xmm5,%%xmm5 \n" 4400 4401 // 8 pixel loop. 4402 LABELALIGN 4403 "1: \n" 4404 "movq " MEMACCESS(0) ",%%xmm0 \n" 4405 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" 4406 "punpcklbw %%xmm5,%%xmm0 \n" 4407 "punpcklbw %%xmm5,%%xmm1 \n" 4408 "psubw %%xmm1,%%xmm0 \n" 4409 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4410 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 4411 "punpcklbw %%xmm5,%%xmm1 \n" 4412 "punpcklbw %%xmm5,%%xmm2 \n" 4413 "psubw %%xmm2,%%xmm1 \n" 4414 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 4415 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 4416 "punpcklbw %%xmm5,%%xmm2 \n" 4417 "punpcklbw %%xmm5,%%xmm3 \n" 4418 "psubw %%xmm3,%%xmm2 \n" 4419 "paddw %%xmm2,%%xmm0 \n" 4420 "paddw %%xmm1,%%xmm0 \n" 4421 "paddw %%xmm1,%%xmm0 \n" 4422 "pxor %%xmm1,%%xmm1 \n" 4423 "psubw %%xmm0,%%xmm1 \n" 4424 "pmaxsw %%xmm1,%%xmm0 \n" 4425 "packuswb %%xmm0,%%xmm0 \n" 4426 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) 4427 "lea " MEMLEA(0x8,0) ",%0 \n" 4428 "sub $0x8,%4 \n" 4429 "jg 1b \n" 4430 : "+r"(src_y0), // %0 4431 "+r"(src_y1), // %1 4432 "+r"(src_y2), // %2 4433 
"+r"(dst_sobelx), // %3 4434 "+r"(width) // %4 4435 : 4436 : "memory", "cc", NACL_R14 4437 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4438 ); 4439} 4440#endif // HAS_SOBELXROW_SSE2 4441 4442#ifdef HAS_SOBELYROW_SSE2 4443// SobelY as a matrix is 4444// -1 -2 -1 4445// 0 0 0 4446// 1 2 1 4447void SobelYRow_SSE2(const uint8* src_y0, 4448 const uint8* src_y1, 4449 uint8* dst_sobely, 4450 int width) { 4451 asm volatile ( 4452 "sub %0,%1 \n" 4453 "sub %0,%2 \n" 4454 "pxor %%xmm5,%%xmm5 \n" 4455 4456 // 8 pixel loop. 4457 LABELALIGN 4458 "1: \n" 4459 "movq " MEMACCESS(0) ",%%xmm0 \n" 4460 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 4461 "punpcklbw %%xmm5,%%xmm0 \n" 4462 "punpcklbw %%xmm5,%%xmm1 \n" 4463 "psubw %%xmm1,%%xmm0 \n" 4464 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" 4465 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 4466 "punpcklbw %%xmm5,%%xmm1 \n" 4467 "punpcklbw %%xmm5,%%xmm2 \n" 4468 "psubw %%xmm2,%%xmm1 \n" 4469 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" 4470 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 4471 "punpcklbw %%xmm5,%%xmm2 \n" 4472 "punpcklbw %%xmm5,%%xmm3 \n" 4473 "psubw %%xmm3,%%xmm2 \n" 4474 "paddw %%xmm2,%%xmm0 \n" 4475 "paddw %%xmm1,%%xmm0 \n" 4476 "paddw %%xmm1,%%xmm0 \n" 4477 "pxor %%xmm1,%%xmm1 \n" 4478 "psubw %%xmm0,%%xmm1 \n" 4479 "pmaxsw %%xmm1,%%xmm0 \n" 4480 "packuswb %%xmm0,%%xmm0 \n" 4481 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) 4482 "lea " MEMLEA(0x8,0) ",%0 \n" 4483 "sub $0x8,%3 \n" 4484 "jg 1b \n" 4485 : "+r"(src_y0), // %0 4486 "+r"(src_y1), // %1 4487 "+r"(dst_sobely), // %2 4488 "+r"(width) // %3 4489 : 4490 : "memory", "cc", NACL_R14 4491 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4492 ); 4493} 4494#endif // HAS_SOBELYROW_SSE2 4495 4496#ifdef HAS_SOBELROW_SSE2 4497// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
4498// A = 255 4499// R = Sobel 4500// G = Sobel 4501// B = Sobel 4502void SobelRow_SSE2(const uint8* src_sobelx, 4503 const uint8* src_sobely, 4504 uint8* dst_argb, 4505 int width) { 4506 asm volatile ( 4507 "sub %0,%1 \n" 4508 "pcmpeqb %%xmm5,%%xmm5 \n" 4509 "pslld $0x18,%%xmm5 \n" 4510 4511 // 8 pixel loop. 4512 LABELALIGN 4513 "1: \n" 4514 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4515 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4516 "lea " MEMLEA(0x10,0) ",%0 \n" 4517 "paddusb %%xmm1,%%xmm0 \n" 4518 "movdqa %%xmm0,%%xmm2 \n" 4519 "punpcklbw %%xmm0,%%xmm2 \n" 4520 "punpckhbw %%xmm0,%%xmm0 \n" 4521 "movdqa %%xmm2,%%xmm1 \n" 4522 "punpcklwd %%xmm2,%%xmm1 \n" 4523 "punpckhwd %%xmm2,%%xmm2 \n" 4524 "por %%xmm5,%%xmm1 \n" 4525 "por %%xmm5,%%xmm2 \n" 4526 "movdqa %%xmm0,%%xmm3 \n" 4527 "punpcklwd %%xmm0,%%xmm3 \n" 4528 "punpckhwd %%xmm0,%%xmm0 \n" 4529 "por %%xmm5,%%xmm3 \n" 4530 "por %%xmm5,%%xmm0 \n" 4531 "movdqu %%xmm1," MEMACCESS(2) " \n" 4532 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" 4533 "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" 4534 "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" 4535 "lea " MEMLEA(0x40,2) ",%2 \n" 4536 "sub $0x10,%3 \n" 4537 "jg 1b \n" 4538 : "+r"(src_sobelx), // %0 4539 "+r"(src_sobely), // %1 4540 "+r"(dst_argb), // %2 4541 "+r"(width) // %3 4542 : 4543 : "memory", "cc", NACL_R14 4544 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 4545 ); 4546} 4547#endif // HAS_SOBELROW_SSE2 4548 4549#ifdef HAS_SOBELTOPLANEROW_SSE2 4550// Adds Sobel X and Sobel Y and stores Sobel into a plane. 4551void SobelToPlaneRow_SSE2(const uint8* src_sobelx, 4552 const uint8* src_sobely, 4553 uint8* dst_y, 4554 int width) { 4555 asm volatile ( 4556 "sub %0,%1 \n" 4557 "pcmpeqb %%xmm5,%%xmm5 \n" 4558 "pslld $0x18,%%xmm5 \n" 4559 4560 // 8 pixel loop. 
4561 LABELALIGN 4562 "1: \n" 4563 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4564 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4565 "lea " MEMLEA(0x10,0) ",%0 \n" 4566 "paddusb %%xmm1,%%xmm0 \n" 4567 "movdqu %%xmm0," MEMACCESS(2) " \n" 4568 "lea " MEMLEA(0x10,2) ",%2 \n" 4569 "sub $0x10,%3 \n" 4570 "jg 1b \n" 4571 : "+r"(src_sobelx), // %0 4572 "+r"(src_sobely), // %1 4573 "+r"(dst_y), // %2 4574 "+r"(width) // %3 4575 : 4576 : "memory", "cc", NACL_R14 4577 "xmm0", "xmm1" 4578 ); 4579} 4580#endif // HAS_SOBELTOPLANEROW_SSE2 4581 4582#ifdef HAS_SOBELXYROW_SSE2 4583// Mixes Sobel X, Sobel Y and Sobel into ARGB. 4584// A = 255 4585// R = Sobel X 4586// G = Sobel 4587// B = Sobel Y 4588void SobelXYRow_SSE2(const uint8* src_sobelx, 4589 const uint8* src_sobely, 4590 uint8* dst_argb, 4591 int width) { 4592 asm volatile ( 4593 "sub %0,%1 \n" 4594 "pcmpeqb %%xmm5,%%xmm5 \n" 4595 4596 // 8 pixel loop. 4597 LABELALIGN 4598 "1: \n" 4599 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4600 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 4601 "lea " MEMLEA(0x10,0) ",%0 \n" 4602 "movdqa %%xmm0,%%xmm2 \n" 4603 "paddusb %%xmm1,%%xmm2 \n" 4604 "movdqa %%xmm0,%%xmm3 \n" 4605 "punpcklbw %%xmm5,%%xmm3 \n" 4606 "punpckhbw %%xmm5,%%xmm0 \n" 4607 "movdqa %%xmm1,%%xmm4 \n" 4608 "punpcklbw %%xmm2,%%xmm4 \n" 4609 "punpckhbw %%xmm2,%%xmm1 \n" 4610 "movdqa %%xmm4,%%xmm6 \n" 4611 "punpcklwd %%xmm3,%%xmm6 \n" 4612 "punpckhwd %%xmm3,%%xmm4 \n" 4613 "movdqa %%xmm1,%%xmm7 \n" 4614 "punpcklwd %%xmm0,%%xmm7 \n" 4615 "punpckhwd %%xmm0,%%xmm1 \n" 4616 "movdqu %%xmm6," MEMACCESS(2) " \n" 4617 "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" 4618 "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" 4619 "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" 4620 "lea " MEMLEA(0x40,2) ",%2 \n" 4621 "sub $0x10,%3 \n" 4622 "jg 1b \n" 4623 : "+r"(src_sobelx), // %0 4624 "+r"(src_sobely), // %1 4625 "+r"(dst_argb), // %2 4626 "+r"(width) // %3 4627 : 4628 : "memory", "cc", NACL_R14 4629 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", 
"xmm5", "xmm6", "xmm7" 4630 ); 4631} 4632#endif // HAS_SOBELXYROW_SSE2 4633 4634#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 4635// Creates a table of cumulative sums where each value is a sum of all values 4636// above and to the left of the value, inclusive of the value. 4637void ComputeCumulativeSumRow_SSE2(const uint8* row, 4638 int32* cumsum, 4639 const int32* previous_cumsum, 4640 int width) { 4641 asm volatile ( 4642 "pxor %%xmm0,%%xmm0 \n" 4643 "pxor %%xmm1,%%xmm1 \n" 4644 "sub $0x4,%3 \n" 4645 "jl 49f \n" 4646 "test $0xf,%1 \n" 4647 "jne 49f \n" 4648 4649 // 4 pixel loop. 4650 LABELALIGN 4651 "40: \n" 4652 "movdqu " MEMACCESS(0) ",%%xmm2 \n" 4653 "lea " MEMLEA(0x10,0) ",%0 \n" 4654 "movdqa %%xmm2,%%xmm4 \n" 4655 "punpcklbw %%xmm1,%%xmm2 \n" 4656 "movdqa %%xmm2,%%xmm3 \n" 4657 "punpcklwd %%xmm1,%%xmm2 \n" 4658 "punpckhwd %%xmm1,%%xmm3 \n" 4659 "punpckhbw %%xmm1,%%xmm4 \n" 4660 "movdqa %%xmm4,%%xmm5 \n" 4661 "punpcklwd %%xmm1,%%xmm4 \n" 4662 "punpckhwd %%xmm1,%%xmm5 \n" 4663 "paddd %%xmm2,%%xmm0 \n" 4664 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 4665 "paddd %%xmm0,%%xmm2 \n" 4666 "paddd %%xmm3,%%xmm0 \n" 4667 "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" 4668 "paddd %%xmm0,%%xmm3 \n" 4669 "paddd %%xmm4,%%xmm0 \n" 4670 "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" 4671 "paddd %%xmm0,%%xmm4 \n" 4672 "paddd %%xmm5,%%xmm0 \n" 4673 "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" 4674 "lea " MEMLEA(0x40,2) ",%2 \n" 4675 "paddd %%xmm0,%%xmm5 \n" 4676 "movdqu %%xmm2," MEMACCESS(1) " \n" 4677 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" 4678 "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" 4679 "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" 4680 "lea " MEMLEA(0x40,1) ",%1 \n" 4681 "sub $0x4,%3 \n" 4682 "jge 40b \n" 4683 4684 "49: \n" 4685 "add $0x3,%3 \n" 4686 "jl 19f \n" 4687 4688 // 1 pixel loop. 
4689 LABELALIGN 4690 "10: \n" 4691 "movd " MEMACCESS(0) ",%%xmm2 \n" 4692 "lea " MEMLEA(0x4,0) ",%0 \n" 4693 "punpcklbw %%xmm1,%%xmm2 \n" 4694 "punpcklwd %%xmm1,%%xmm2 \n" 4695 "paddd %%xmm2,%%xmm0 \n" 4696 "movdqu " MEMACCESS(2) ",%%xmm2 \n" 4697 "lea " MEMLEA(0x10,2) ",%2 \n" 4698 "paddd %%xmm0,%%xmm2 \n" 4699 "movdqu %%xmm2," MEMACCESS(1) " \n" 4700 "lea " MEMLEA(0x10,1) ",%1 \n" 4701 "sub $0x1,%3 \n" 4702 "jge 10b \n" 4703 4704 "19: \n" 4705 : "+r"(row), // %0 4706 "+r"(cumsum), // %1 4707 "+r"(previous_cumsum), // %2 4708 "+r"(width) // %3 4709 : 4710 : "memory", "cc" 4711 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 4712 ); 4713} 4714#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 4715 4716#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4717void CumulativeSumToAverageRow_SSE2(const int32* topleft, 4718 const int32* botleft, 4719 int width, 4720 int area, 4721 uint8* dst, 4722 int count) { 4723 asm volatile ( 4724 "movd %5,%%xmm5 \n" 4725 "cvtdq2ps %%xmm5,%%xmm5 \n" 4726 "rcpss %%xmm5,%%xmm4 \n" 4727 "pshufd $0x0,%%xmm4,%%xmm4 \n" 4728 "sub $0x4,%3 \n" 4729 "jl 49f \n" 4730 "cmpl $0x80,%5 \n" 4731 "ja 40f \n" 4732 4733 "pshufd $0x0,%%xmm5,%%xmm5 \n" 4734 "pcmpeqb %%xmm6,%%xmm6 \n" 4735 "psrld $0x10,%%xmm6 \n" 4736 "cvtdq2ps %%xmm6,%%xmm6 \n" 4737 "addps %%xmm6,%%xmm5 \n" 4738 "mulps %%xmm4,%%xmm5 \n" 4739 "cvtps2dq %%xmm5,%%xmm5 \n" 4740 "packssdw %%xmm5,%%xmm5 \n" 4741 4742 // 4 pixel small loop. 
4743 LABELALIGN 4744 "4: \n" 4745 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4746 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 4747 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 4748 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 4749 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 4750 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 4751 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 4752 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 4753 "lea " MEMLEA(0x40,0) ",%0 \n" 4754 "psubd " MEMACCESS(1) ",%%xmm0 \n" 4755 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 4756 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 4757 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 4758 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 4759 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 4760 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 4761 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 4762 "lea " MEMLEA(0x40,1) ",%1 \n" 4763 "packssdw %%xmm1,%%xmm0 \n" 4764 "packssdw %%xmm3,%%xmm2 \n" 4765 "pmulhuw %%xmm5,%%xmm0 \n" 4766 "pmulhuw %%xmm5,%%xmm2 \n" 4767 "packuswb %%xmm2,%%xmm0 \n" 4768 "movdqu %%xmm0," MEMACCESS(2) " \n" 4769 "lea " MEMLEA(0x10,2) ",%2 \n" 4770 "sub $0x4,%3 \n" 4771 "jge 4b \n" 4772 "jmp 49f \n" 4773 4774 // 4 pixel loop \n" 4775 LABELALIGN 4776 "40: \n" 4777 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4778 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 4779 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 4780 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 4781 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 4782 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 4783 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 4784 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 4785 "lea " MEMLEA(0x40,0) ",%0 \n" 4786 "psubd " MEMACCESS(1) ",%%xmm0 \n" 4787 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 4788 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 4789 "psubd " 
MEMACCESS2(0x30,1) ",%%xmm3 \n" 4790 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 4791 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 4792 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 4793 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 4794 "lea " MEMLEA(0x40,1) ",%1 \n" 4795 "cvtdq2ps %%xmm0,%%xmm0 \n" 4796 "cvtdq2ps %%xmm1,%%xmm1 \n" 4797 "mulps %%xmm4,%%xmm0 \n" 4798 "mulps %%xmm4,%%xmm1 \n" 4799 "cvtdq2ps %%xmm2,%%xmm2 \n" 4800 "cvtdq2ps %%xmm3,%%xmm3 \n" 4801 "mulps %%xmm4,%%xmm2 \n" 4802 "mulps %%xmm4,%%xmm3 \n" 4803 "cvtps2dq %%xmm0,%%xmm0 \n" 4804 "cvtps2dq %%xmm1,%%xmm1 \n" 4805 "cvtps2dq %%xmm2,%%xmm2 \n" 4806 "cvtps2dq %%xmm3,%%xmm3 \n" 4807 "packssdw %%xmm1,%%xmm0 \n" 4808 "packssdw %%xmm3,%%xmm2 \n" 4809 "packuswb %%xmm2,%%xmm0 \n" 4810 "movdqu %%xmm0," MEMACCESS(2) " \n" 4811 "lea " MEMLEA(0x10,2) ",%2 \n" 4812 "sub $0x4,%3 \n" 4813 "jge 40b \n" 4814 4815 "49: \n" 4816 "add $0x3,%3 \n" 4817 "jl 19f \n" 4818 4819 // 1 pixel loop \n" 4820 LABELALIGN 4821 "10: \n" 4822 "movdqu " MEMACCESS(0) ",%%xmm0 \n" 4823 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 4824 "lea " MEMLEA(0x10,0) ",%0 \n" 4825 "psubd " MEMACCESS(1) ",%%xmm0 \n" 4826 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 4827 "lea " MEMLEA(0x10,1) ",%1 \n" 4828 "cvtdq2ps %%xmm0,%%xmm0 \n" 4829 "mulps %%xmm4,%%xmm0 \n" 4830 "cvtps2dq %%xmm0,%%xmm0 \n" 4831 "packssdw %%xmm0,%%xmm0 \n" 4832 "packuswb %%xmm0,%%xmm0 \n" 4833 "movd %%xmm0," MEMACCESS(2) " \n" 4834 "lea " MEMLEA(0x4,2) ",%2 \n" 4835 "sub $0x1,%3 \n" 4836 "jge 10b \n" 4837 "19: \n" 4838 : "+r"(topleft), // %0 4839 "+r"(botleft), // %1 4840 "+r"(dst), // %2 4841 "+rm"(count) // %3 4842 : "r"((intptr_t)(width)), // %4 4843 "rm"(area) // %5 4844 : "memory", "cc", NACL_R14 4845 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 4846 ); 4847} 4848#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 4849 4850#ifdef HAS_ARGBAFFINEROW_SSE2 4851// Copy 
// ARGB pixels from source image with slope to a row of destination.
//
// src_dudv holds four floats: the first two are the starting (x, y) source
// coordinate and the next two are added to it for every destination pixel.
// Coordinates are truncated to integers and turned into a byte offset
// x * 4 + y * src_argb_stride with pmaddwd, so both x and y and the stride
// pass through signed 16 bit lanes.
// NOTE(review): the offset is moved into a 32 bit register (%k1 / %k5) and
// then used as a full-width index - presumably offsets are expected to be
// non-negative; confirm against callers.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb,
                        int src_argb_stride,
                        uint8* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq " MEMACCESS(3) ",%%xmm2 \n"          // starting x, y
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"    // per pixel step
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"                           // words: (4, stride)
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    // Set up coordinates for pixels 0,1 (xmm2) and 2,3 (xmm3), and a step of
    // 4 pixels (xmm4).
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n"  // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n"   // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n"    // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)  // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)  // movd (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)  // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%4 \n"  // undo the bias of 4, leaving (remaining - 1)
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)  // movd (%0,%1,1),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "sub $0x1,%4 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),  // %4
    "=&r"(temp)  // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
//
// Blends the row at src_ptr with the row at src_ptr + src_stride.
// source_y_fraction selects the path: 0 copies the first row, 0x80 averages
// the two rows with pavgb, anything else weights the rows by
// (256 - fraction) and fraction. 16 bytes are handled per iteration.
void InterpolateRow_SSSE3(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"  // dst_ptr becomes an offset from src_ptr
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    // xmm5 = byte pairs (256 - fraction, fraction); xmm4 = 0x80 in every byte.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)  // movdqu (%1,%4,1),%%xmm2
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm4,%%xmm0 \n"  // bias pixels by 0x80 for the signed operand
    "psubb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm5,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm3 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "pmaddubsw %%xmm1,%%xmm3 \n"
    "paddw %%xmm4,%%xmm2 \n"  // add 0x8080 per word before the shift
    "paddw %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)  // movdqu %%xmm2,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)  // movdqu (%1,%4,1),%%xmm1
    "pavgb %%xmm1,%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)  // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)  // movdqu %%xmm0,(%1,%0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 100b \n"

    "99: \n"
  : "+r"(dst_ptr),  // %0
    "+r"(src_ptr),  // %1
    "+rm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_SSSE3

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
//
// Same three paths as the SSSE3 version, 32 bytes per iteration. The copy
// path (fraction 0) uses rep movsb, which is why dst_ptr, src_ptr and
// dst_width are tied to the D, S and c registers, and why the fraction test
// happens before dst_ptr is turned into an offset.
// NOTE(review): dst_width is declared "+cm", so the compiler is allowed to
// keep it in memory, in which case rep movsb would not see the count -
// confirm this cannot happen on the supported targets.
void InterpolateRow_AVX2(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "sub %1,%0 \n"  // dst_ptr becomes an offset from src_ptr
    "cmp $0x80,%3 \n"
    "je 50f \n"

    // ymm5 = byte pairs (256 - fraction, fraction); ymm4 = 0x80 in every byte.
    "vmovd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "vmovd %3,%%xmm5 \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
    "vbroadcastss %%xmm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm4 \n"
    "vbroadcastss %%xmm4,%%ymm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)  // vmovdqu (%1,%4,1),%%ymm2
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
    "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)  // vmovdqu %%ymm0,(%1,%0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)  // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)  // vmovdqu %%ymm0,(%1,%0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "rep movsb " MEMMOVESTRING(1,0) " \n"
    "jmp 999f \n"  // no ymm register was used on this path

    "99: \n"
    "vzeroupper \n"
    "999: \n"
  : "+D"(dst_ptr),  // %0
    "+S"(src_ptr),  // %1
    "+cm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Loads a 16 byte pshufb control from |shuffler| and applies it to every 16
// bytes of src_argb. 8 pixels (32 bytes) are handled per iteration.
void ARGBShuffleRow_SSSE3(const uint8* src_argb,
                          uint8* dst_argb,
                          const uint8* shuffler,
                          int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(shuffler)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Broadcasts the 16 byte control at |shuffler| to both halves of ymm5 and
// applies it with vpshufb. 16 pixels (64 bytes) are handled per iteration.
void ARGBShuffleRow_AVX2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(shuffler)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
//
// The first four bytes of |shuffler| are read as one 32 bit value and
// compared against four known patterns, each of which has a 4 pixel loop
// built from pshufhw / pshuflw. Any other pattern falls through to a 1 pixel
// loop that looks up each output byte through the shuffler table.
void ARGBShuffleRow_SSE2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  uintptr_t pixel_temp;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"

    // Generic 1 pixel loop: dst[i] = src[shuffler[i]] for i = 0..3.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Shuffler bytes 3,2,1,0: 4 pixel loop.
    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"

    // Shuffler bytes 1,2,3,0: 4 pixel loop.
    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"

    // Shuffler bytes 3,0,1,2: 4 pixel loop.
    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    // Shuffler bytes 2,1,0,3: 4 pixel loop.
    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"

    "99: \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)  // %3
  : "r"(shuffler)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2

#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleaves one row of planar Y, U and V into packed output, with each Y
// byte written before its chroma byte. Each iteration reads 16 Y, 8 U and
// 8 V bytes and writes 32 bytes to dst_frame.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    "sub %1,%2 \n"  // src_v becomes an offset from src_u
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"  // interleave U and V
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"  // Y first, then chroma
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(src_u),  // %1
    "+r"(src_v),  // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)  // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
// Interleaves one row of planar Y, U and V into packed output, with each
// chroma byte written before its Y byte. Each iteration reads 16 Y, 8 U and
// 8 V bytes and writes 32 bytes to dst_frame.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    "sub %1,%2 \n"  // src_v becomes an offset from src_u
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"  // interleave U and V
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"  // chroma first, then Y
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),  // %0
    "+r"(src_u),  // %1
    "+r"(src_v),  // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)  // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), //
%0 5403 "+r"(dst_argb), // %1 5404 "+r"(width) // %2 5405 : "r"(poly) // %3 5406 : "memory", "cc" 5407 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 5408 ); 5409} 5410#endif // HAS_ARGBPOLYNOMIALROW_SSE2 5411 5412#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 5413void ARGBPolynomialRow_AVX2(const uint8* src_argb, 5414 uint8* dst_argb, 5415 const float* poly, 5416 int width) { 5417 asm volatile ( 5418 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" 5419 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" 5420 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" 5421 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" 5422 5423 // 2 pixel loop. 5424 LABELALIGN 5425 "1: \n" 5426 "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels 5427 "lea " MEMLEA(0x8,0) ",%0 \n" 5428 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats 5429 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X 5430 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X 5431 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X 5432 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X 5433 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X 5434 "vcvttps2dq %%ymm0,%%ymm0 \n" 5435 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" 5436 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 5437 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" 5438 "vmovq %%xmm0," MEMACCESS(1) " \n" 5439 "lea " MEMLEA(0x8,1) ",%1 \n" 5440 "sub $0x2,%2 \n" 5441 "jg 1b \n" 5442 "vzeroupper \n" 5443 : "+r"(src_argb), // %0 5444 "+r"(dst_argb), // %1 5445 "+r"(width) // %2 5446 : "r"(poly) // %3 5447 : "memory", "cc", 5448 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 5449 ); 5450} 5451#endif // HAS_ARGBPOLYNOMIALROW_AVX2 5452 5453#ifdef HAS_HALFFLOATROW_SSE2 5454static float kScaleBias = 1.9259299444e-34f; 5455void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) { 5456 asm volatile ( 5457 "pshufd $0x0,%3,%%xmm4 \n" 5458 "pxor %%xmm5,%%xmm5 \n" 5459 "sub %0,%1 \n" 5460 5461 // 16 pixel loop. 
5462 LABELALIGN 5463 "1: \n" 5464 "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts 5465 "add $0x10,%0 \n" 5466 "movdqa %%xmm2,%%xmm3 \n" 5467 "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 5468 "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats 5469 "punpckhwd %%xmm5,%%xmm3 \n" 5470 "cvtdq2ps %%xmm3,%%xmm3 \n" 5471 "mulps %%xmm4,%%xmm2 \n" 5472 "mulps %%xmm4,%%xmm3 \n" 5473 "psrld $0xd,%%xmm2 \n" 5474 "psrld $0xd,%%xmm3 \n" 5475 "packssdw %%xmm3,%%xmm2 \n" 5476 MEMOPMEM(movdqu,xmm2,-0x10,0,1,1) 5477 "sub $0x8,%2 \n" 5478 "jg 1b \n" 5479 : "+r"(src), // %0 5480 "+r"(dst), // %1 5481 "+r"(width) // %2 5482 : "x"(scale * kScaleBias) // %3 5483 : "memory", "cc", 5484 "xmm2", "xmm3", "xmm4", "xmm5" 5485 ); 5486} 5487#endif // HAS_HALFFLOATROW_SSE2 5488 5489#ifdef HAS_HALFFLOATROW_AVX2 5490void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) { 5491 asm volatile ( 5492 "vbroadcastss %3, %%ymm4 \n" 5493 "vpxor %%ymm5,%%ymm5,%%ymm5 \n" 5494 "sub %0,%1 \n" 5495 5496 // 16 pixel loop. 
5497 LABELALIGN 5498 "1: \n" 5499 "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts 5500 "add $0x20,%0 \n" 5501 "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates 5502 "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" 5503 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5504 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5505 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5506 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5507 "vpsrld $0xd,%%ymm3,%%ymm3 \n" 5508 "vpsrld $0xd,%%ymm2,%%ymm2 \n" 5509 "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates 5510 MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1) 5511 "sub $0x10,%2 \n" 5512 "jg 1b \n" 5513 5514 "vzeroupper \n" 5515 : "+r"(src), // %0 5516 "+r"(dst), // %1 5517 "+r"(width) // %2 5518 : "x"(scale * kScaleBias) // %3 5519 : "memory", "cc", 5520 "xmm2", "xmm3", "xmm4", "xmm5" 5521 ); 5522} 5523#endif // HAS_HALFFLOATROW_AVX2 5524 5525#ifdef HAS_HALFFLOATROW_F16C 5526void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { 5527 asm volatile ( 5528 "vbroadcastss %3, %%ymm4 \n" 5529 "sub %0,%1 \n" 5530 5531 // 16 pixel loop. 5532 LABELALIGN 5533 "1: \n" 5534 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints 5535 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" 5536 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5537 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5538 "vmulps %%ymm2,%%ymm4,%%ymm2 \n" 5539 "vmulps %%ymm3,%%ymm4,%%ymm3 \n" 5540 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" 5541 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" 5542 MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) 5543 MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) 5544 "add $0x20,%0 \n" 5545 "sub $0x10,%2 \n" 5546 "jg 1b \n" 5547 "vzeroupper \n" 5548 : "+r"(src), // %0 5549 "+r"(dst), // %1 5550 "+r"(width) // %2 5551 : "x"(scale) // %3 5552 : "memory", "cc", 5553 "xmm2", "xmm3", "xmm4" 5554 ); 5555} 5556#endif // HAS_HALFFLOATROW_F16C 5557 5558#ifdef HAS_HALFFLOATROW_F16C 5559void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { 5560 asm volatile ( 5561 "sub %0,%1 \n" 5562 // 16 pixel loop. 
5563 LABELALIGN 5564 "1: \n" 5565 "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints 5566 "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" 5567 "vcvtdq2ps %%ymm2,%%ymm2 \n" 5568 "vcvtdq2ps %%ymm3,%%ymm3 \n" 5569 "vcvtps2ph $3, %%ymm2, %%xmm2 \n" 5570 "vcvtps2ph $3, %%ymm3, %%xmm3 \n" 5571 MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) 5572 MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) 5573 "add $0x20,%0 \n" 5574 "sub $0x10,%2 \n" 5575 "jg 1b \n" 5576 "vzeroupper \n" 5577 : "+r"(src), // %0 5578 "+r"(dst), // %1 5579 "+r"(width) // %2 5580 : 5581 : "memory", "cc", 5582 "xmm2", "xmm3" 5583 ); 5584} 5585#endif // HAS_HALFFLOATROW_F16C 5586 5587#ifdef HAS_ARGBCOLORTABLEROW_X86 5588// Tranform ARGB pixels with color table. 5589void ARGBColorTableRow_X86(uint8* dst_argb, 5590 const uint8* table_argb, 5591 int width) { 5592 uintptr_t pixel_temp; 5593 asm volatile ( 5594 // 1 pixel loop. 5595 LABELALIGN 5596 "1: \n" 5597 "movzb " MEMACCESS(0) ",%1 \n" 5598 "lea " MEMLEA(0x4,0) ",%0 \n" 5599 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 5600 "mov %b1," MEMACCESS2(-0x4,0) " \n" 5601 "movzb " MEMACCESS2(-0x3,0) ",%1 \n" 5602 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 5603 "mov %b1," MEMACCESS2(-0x3,0) " \n" 5604 "movzb " MEMACCESS2(-0x2,0) ",%1 \n" 5605 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 5606 "mov %b1," MEMACCESS2(-0x2,0) " \n" 5607 "movzb " MEMACCESS2(-0x1,0) ",%1 \n" 5608 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 5609 "mov %b1," MEMACCESS2(-0x1,0) " \n" 5610 "dec %2 \n" 5611 "jg 1b \n" 5612 : "+r"(dst_argb), // %0 5613 "=&d"(pixel_temp), // %1 5614 "+r"(width) // %2 5615 : "r"(table_argb) // %3 5616 : "memory", "cc"); 5617} 5618#endif // HAS_ARGBCOLORTABLEROW_X86 5619 5620#ifdef HAS_RGBCOLORTABLEROW_X86 5621// Tranform RGB pixels with color table. 5622void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { 5623 uintptr_t pixel_temp; 5624 asm volatile ( 5625 // 1 pixel loop. 
5626 LABELALIGN 5627 "1: \n" 5628 "movzb " MEMACCESS(0) ",%1 \n" 5629 "lea " MEMLEA(0x4,0) ",%0 \n" 5630 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 5631 "mov %b1," MEMACCESS2(-0x4,0) " \n" 5632 "movzb " MEMACCESS2(-0x3,0) ",%1 \n" 5633 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 5634 "mov %b1," MEMACCESS2(-0x3,0) " \n" 5635 "movzb " MEMACCESS2(-0x2,0) ",%1 \n" 5636 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 5637 "mov %b1," MEMACCESS2(-0x2,0) " \n" 5638 "dec %2 \n" 5639 "jg 1b \n" 5640 : "+r"(dst_argb), // %0 5641 "=&d"(pixel_temp), // %1 5642 "+r"(width) // %2 5643 : "r"(table_argb) // %3 5644 : "memory", "cc"); 5645} 5646#endif // HAS_RGBCOLORTABLEROW_X86 5647 5648#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 5649// Tranform RGB pixels with luma table. 5650void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, 5651 uint8* dst_argb, 5652 int width, 5653 const uint8* luma, 5654 uint32 lumacoeff) { 5655 uintptr_t pixel_temp; 5656 uintptr_t table_temp; 5657 asm volatile ( 5658 "movd %6,%%xmm3 \n" 5659 "pshufd $0x0,%%xmm3,%%xmm3 \n" 5660 "pcmpeqb %%xmm4,%%xmm4 \n" 5661 "psllw $0x8,%%xmm4 \n" 5662 "pxor %%xmm5,%%xmm5 \n" 5663 5664 // 4 pixel loop. 
5665 LABELALIGN 5666 "1: \n" 5667 "movdqu " MEMACCESS(2) ",%%xmm0 \n" 5668 "pmaddubsw %%xmm3,%%xmm0 \n" 5669 "phaddw %%xmm0,%%xmm0 \n" 5670 "pand %%xmm4,%%xmm0 \n" 5671 "punpcklwd %%xmm5,%%xmm0 \n" 5672 "movd %%xmm0,%k1 \n" // 32 bit offset 5673 "add %5,%1 \n" 5674 "pshufd $0x39,%%xmm0,%%xmm0 \n" 5675 5676 "movzb " MEMACCESS(2) ",%0 \n" 5677 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5678 "mov %b0," MEMACCESS(3) " \n" 5679 "movzb " MEMACCESS2(0x1,2) ",%0 \n" 5680 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5681 "mov %b0," MEMACCESS2(0x1,3) " \n" 5682 "movzb " MEMACCESS2(0x2,2) ",%0 \n" 5683 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5684 "mov %b0," MEMACCESS2(0x2,3) " \n" 5685 "movzb " MEMACCESS2(0x3,2) ",%0 \n" 5686 "mov %b0," MEMACCESS2(0x3,3) " \n" 5687 5688 "movd %%xmm0,%k1 \n" // 32 bit offset 5689 "add %5,%1 \n" 5690 "pshufd $0x39,%%xmm0,%%xmm0 \n" 5691 5692 "movzb " MEMACCESS2(0x4,2) ",%0 \n" 5693 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5694 "mov %b0," MEMACCESS2(0x4,3) " \n" 5695 "movzb " MEMACCESS2(0x5,2) ",%0 \n" 5696 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5697 "mov %b0," MEMACCESS2(0x5,3) " \n" 5698 "movzb " MEMACCESS2(0x6,2) ",%0 \n" 5699 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5700 "mov %b0," MEMACCESS2(0x6,3) " \n" 5701 "movzb " MEMACCESS2(0x7,2) ",%0 \n" 5702 "mov %b0," MEMACCESS2(0x7,3) " \n" 5703 5704 "movd %%xmm0,%k1 \n" // 32 bit offset 5705 "add %5,%1 \n" 5706 "pshufd $0x39,%%xmm0,%%xmm0 \n" 5707 5708 "movzb " MEMACCESS2(0x8,2) ",%0 \n" 5709 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5710 "mov %b0," MEMACCESS2(0x8,3) " \n" 5711 "movzb " MEMACCESS2(0x9,2) ",%0 \n" 5712 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5713 "mov %b0," MEMACCESS2(0x9,3) " \n" 5714 "movzb " MEMACCESS2(0xa,2) ",%0 \n" 5715 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5716 "mov %b0," MEMACCESS2(0xa,3) " \n" 5717 "movzb " MEMACCESS2(0xb,2) ",%0 
\n" 5718 "mov %b0," MEMACCESS2(0xb,3) " \n" 5719 5720 "movd %%xmm0,%k1 \n" // 32 bit offset 5721 "add %5,%1 \n" 5722 5723 "movzb " MEMACCESS2(0xc,2) ",%0 \n" 5724 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5725 "mov %b0," MEMACCESS2(0xc,3) " \n" 5726 "movzb " MEMACCESS2(0xd,2) ",%0 \n" 5727 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5728 "mov %b0," MEMACCESS2(0xd,3) " \n" 5729 "movzb " MEMACCESS2(0xe,2) ",%0 \n" 5730 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 5731 "mov %b0," MEMACCESS2(0xe,3) " \n" 5732 "movzb " MEMACCESS2(0xf,2) ",%0 \n" 5733 "mov %b0," MEMACCESS2(0xf,3) " \n" 5734 "lea " MEMLEA(0x10,2) ",%2 \n" 5735 "lea " MEMLEA(0x10,3) ",%3 \n" 5736 "sub $0x4,%4 \n" 5737 "jg 1b \n" 5738 : "=&d"(pixel_temp), // %0 5739 "=&a"(table_temp), // %1 5740 "+r"(src_argb), // %2 5741 "+r"(dst_argb), // %3 5742 "+rm"(width) // %4 5743 : "r"(luma), // %5 5744 "rm"(lumacoeff) // %6 5745 : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" 5746 ); 5747} 5748#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5749 5750#endif // defined(__x86_64__) || defined(__i386__) 5751 5752#ifdef __cplusplus 5753} // extern "C" 5754} // namespace libyuv 5755#endif 5756