1ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian/* 2ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * 4ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian */ 10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "libyuv/row.h" 12ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ 147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian defined(_MSC_VER) && !defined(__clang__) 15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include <emmintrin.h> 16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include <tmmintrin.h> // For _mm_maddubs_epi16 17ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif 18ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 19ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus 20ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramaniannamespace libyuv { 
21ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianextern "C" { 22ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif 23ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 24ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// This module is for Visual C. 257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ 267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian defined(_MSC_VER) && !defined(__clang__) 277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstruct YuvConstants { 297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec8 kUVToB; // 0 307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec8 kUVToG; // 32 317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec8 kUVToR; // 64 327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec16 kUVBiasB; // 96 337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec16 kUVBiasG; // 128 347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec16 kUVBiasR; // 160 357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lvec16 kYToRgb; // 192 36ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 37ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// BT.601 YUV to RGB reference 397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// R = (Y - 16) * 1.164 - V * -1.596 407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// B = (Y - 16) * 1.164 - U * -2.018 427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
// Y contribution to R,G,B.  Scale and bias.
// All coefficients below are fixed point with 6 fraction bits (scaled by 64).
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB (-1160) /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
// Negative values are parenthesised so the macros expand safely inside any
// expression.
#define UB (-128) /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR (-102) /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)

// BT601 constants for YUV to RGB.
607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic YuvConstants SIMD_ALIGNED(kYuvConstants) = { 617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, 627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, 637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, 647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, 657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, 677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, 687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, 697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, 707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } 71ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 72ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// BT601 constants for NV21 where chroma plane is VU instead of UV. 
747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic YuvConstants SIMD_ALIGNED(kYvuConstants) = { 757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, 777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, 787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, 797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, 807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, 817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, 827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, 837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, 847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } 85ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 86ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YG 887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGB 897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UB 907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UG 917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VG 
#undef VR
#undef BB
#undef BG
#undef BR

// JPEG YUV to RGB reference
// *  R = Y                - V * -1.40200
// *  G = Y - U *  0.34414 - V *  0.71414
// *  B = Y - U * -1.77200

// Y contribution to R,G,B.  Scale and bias.
// All coefficients below are fixed point with 6 fraction bits (scaled by 64).
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */

// U and V contributions to R,G,B.
// Negative values are parenthesised so the macros expand safely inside any
// expression.
#define UBJ (-113) /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ (-90) /* round(-1.40200 * 64) */

// Bias values to round the result and subtract 128 from U and V.
// Unlike the BT.601 set there is no Y offset here: YGBJ is only the
// rounding term.
#define BBJ (UBJ * 128             + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ             (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.
1197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic YuvConstants SIMD_ALIGNED(kYuvJConstants) = { 1207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, 1217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, 1227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, 1237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, 1247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, 1257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, 1267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 1277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, 1287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, 1297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, 1307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, 1317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, 1327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, 1337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, 1347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, 1357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, 
YGJ } 136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGJ 1397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGBJ 1407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UBJ 1417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UGJ 1427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VGJ 1437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VRJ 1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BBJ 1457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BGJ 1467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BRJ 147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 64 bit 149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#if defined(_M_X64) 1507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if defined(HAS_I422TOARGBROW_SSSE3) 151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGBRow_SSSE3(const uint8* y_buf, 152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, 155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __m128i xmm0, xmm1, xmm2, xmm3; 157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const __m128i xmm5 = _mm_set1_epi8(-1); 158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const ptrdiff_t offset = (uint8*)v_buf - 
(uint8*)u_buf; 159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian while (width > 0) { 161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); 162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); 163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); 164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); 1657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm1 = _mm_loadu_si128(&xmm0); 1667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm2 = _mm_loadu_si128(&xmm0); 1677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); 1687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); 1697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); 1707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); 1717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); 1727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); 173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm3 = _mm_loadl_epi64((__m128i*)y_buf); 1747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); 1757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); 
176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_adds_epi16(xmm0, xmm3); 177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm1 = _mm_adds_epi16(xmm1, xmm3); 178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm2 = _mm_adds_epi16(xmm2, xmm3); 179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_srai_epi16(xmm0, 6); 180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm1 = _mm_srai_epi16(xmm1, 6); 181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm2 = _mm_srai_epi16(xmm2, 6); 182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_packus_epi16(xmm0, xmm0); 183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm1 = _mm_packus_epi16(xmm1, xmm1); 184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm2 = _mm_packus_epi16(xmm2, xmm2); 185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); 186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); 1877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xmm1 = _mm_loadu_si128(&xmm0); 188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); 189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); 190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)dst_argb, xmm0); 192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); 193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian y_buf += 8; 195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian u_buf += 4; 196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian dst_argb += 32; 197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian width -= 8; 198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif 201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 32 bit 202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#else // defined(_M_X64) 203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBTOYROW_SSSE3 204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for ARGB. 206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToY = { 207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// JPeg full range. 
211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToYJ = { 212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToU = { 216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToUJ = { 220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToV = { 224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToVJ = { 228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// vpshufb for 
vphaddw + vpackuswb packed to shorts. 232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const lvec8 kShufARGBToUV_AVX = { 233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 2347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for BGRA. 238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kBGRAToY = { 239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kBGRAToU = { 243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kBGRAToV = { 247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for ABGR. 
251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kABGRToY = { 252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kABGRToU = { 256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kABGRToV = { 260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for RGBA. 
264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kRGBAToY = { 265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kRGBAToU = { 269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kRGBAToV = { 273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kAddY16 = { 277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u 278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 7 bit fixed point 0.5. 
281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec16 kAddYJ64 = { 282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 64, 64, 64, 64, 64, 64, 64, 64 283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kAddUV128 = { 286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u 288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec16 kAddUVJ128 = { 291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u 292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting RGB24 to ARGB. 295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskRGB24ToARGB = { 296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting RAW to ARGB. 
300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskRAWToARGB = { 301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGB to RGB24. 305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRGB24 = { 306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u 307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGB to RAW. 310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRAW = { 311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. 
First 8 + next 4 315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRGB24_0 = { 316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u 317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGB to RAW. 320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRAW_0 = { 321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Duplicates gray value 3 times and fills in alpha opaque. 
3257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_y 329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xff000000 332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 24 333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [eax] 336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm0 340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm1, xmm1 341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm5 342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm5 3437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 3447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian sub ecx, 8 347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_J400TOARGBROW_AVX2 3537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Duplicates gray value 3 times and fills in alpha opaque. 3547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { 356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_y 3587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 3597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 3607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 3617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm5, ymm5, 24 362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu xmm0, [eax] 3657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 16] 3667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 3677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm0, ymm0 
3687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 3697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhwd ymm1, ymm0, ymm0 3707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd ymm0, ymm0, ymm0 3717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm5 3727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm1, ymm1, ymm5 3737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 3747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 3757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 64] 3767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 3777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 3787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_J400TOARGBROW_AVX2 383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_rgb24 388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 
390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xff000000 391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 24 392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuffleMaskRGB24ToARGB 393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 32] 398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 48] 399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm3 400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} 401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm2, xmm4 402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm5 403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} 404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 4057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 32], xmm2 406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm5 407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm4 4087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm5 410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} 
411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm3, xmm4 4127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm3, xmm5 4147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 48], xmm3 415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 4167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, 424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int pix) { 425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_raw 427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xff000000 430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 24 431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuffleMaskRAWToARGB 432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 32] 437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 48] 438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm3 439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} 440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm2, xmm4 441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm5 442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} 443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 4447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 32], xmm2 445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm5 446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm4 4477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm5 449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} 450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm3, xmm4 4517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm3, xmm5 4537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 48], xmm3 
454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 4557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pmul method to replicate bits. 462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Math to replicate bits: 463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// (v << 8) | (v << 3) 464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// v * 256 + v * 8 465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// v * (256 + 8) 466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 20 instructions. 
4687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, 470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int pix) { 471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, 0x01080108 // generate multiplier to repeat 5 bits 473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, eax 474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm5, xmm5, 0 475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits 476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm6, eax 477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm6, xmm6, 0 478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm3, 11 480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green 481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm4, 10 482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm4, 5 483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha 484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm7, 8 485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_rgb565 487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 
488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 8 pixels of bgr565 494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm3 // R in upper 5 bits 497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm2, 11 // B in upper 5 bits 498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm5 // * (256 + 8) 499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm2, xmm5 // * (256 + 8) 500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm1, 8 501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm2 // RB 502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm4 // G in middle 6 bits 503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm6 // << 5 * (256 + 4) 504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm7 // AG 505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm1 506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm0 507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm2, xmm0 5087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax * 2 + edx], xmm1 // store 4 pixels 
of ARGB 5097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB 510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_RGB565TOARGBROW_AVX2 5187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// pmul method to replicate bits. 5197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Math to replicate bits: 5207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// (v << 8) | (v << 3) 5217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// v * 256 + v * 8 5227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// v * (256 + 8) 5237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 5247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, 5267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int pix) { 5277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 5287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x01080108 // generate multiplier to repeat 5 bits 5297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm5, eax 
5307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm5, xmm5 5317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits 5327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd xmm6, eax 5337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm6, xmm6 5347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red 5357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm3, ymm3, 11 5367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green 5377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm4, ymm4, 10 5387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm4, ymm4, 5 5397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha 5407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm7, ymm7, 8 5417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 5427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_rgb565 5437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 5447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 5457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edx, eax 5467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edx, eax 5477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 5487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 5497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 
5507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm0, ymm3 // R in upper 5 bits 5517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm2, ymm0, 11 // B in upper 5 bits 5527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) 5537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) 5547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm1, ymm1, 8 5557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm1, ymm1, ymm2 // RB 5567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm4 // G in middle 6 bits 5577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) 5587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm7 // AG 5597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // mutate for unpack 5607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 5617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhbw ymm2, ymm1, ymm0 5627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm1, ymm1, ymm0 5637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB 5647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB 5657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 5667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 5687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 
5697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 5707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 5717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 5727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_RGB565TOARGBROW_AVX2 5737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 5747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGB1555TOARGBROW_AVX2 5757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, 5777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int pix) { 5787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 5797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x01080108 // generate multiplier to repeat 5 bits 5807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm5, eax 5817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm5, xmm5 5827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits 5837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd xmm6, eax 5847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm6, xmm6 5857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red 5867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm3, ymm3, 11 5877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green 5887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha 
5897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm7, ymm7, 8 5907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 5917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb1555 5927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 5937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 5947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edx, eax 5957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edx, eax 5967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 5977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 5987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 5997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm1, ymm0, 1 // R in upper 5 bits 6007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm2, ymm0, 11 // B in upper 5 bits 6017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm3 6027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) 6037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) 6047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm1, ymm1, 8 6057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm1, ymm1, ymm2 // RB 6067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsraw ymm2, ymm0, 8 // A 6077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm4 // G in middle 5 bits 6087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) 
6097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm2, ymm2, ymm7 6107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm2 // AG 6117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // mutate for unpack 6127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 6137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhbw ymm2, ymm1, ymm0 6147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm1, ymm1, ymm0 6157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB 6167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB 6177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 6187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 6197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 6207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 6217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 6227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 6237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 6247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGB1555TOARGBROW_AVX2 6257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 6267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGB4444TOARGBROW_AVX2 6277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, 6297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int pix) 
{ 6307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 6317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f 6327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm4, eax 6337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm4, xmm4 6347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles 6357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb4444 6367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 6377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 6387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edx, eax 6397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edx, eax 6407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 6417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 6427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 6437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm2, ymm0, ymm5 // mask high nibbles 6447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm4 // mask low nibbles 6457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm3, ymm2, 4 6467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm1, ymm0, 4 6477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm2, ymm2, ymm3 6487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm1 6497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // mutate for unpack 
6507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm2, ymm2, 0xd8 6517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhbw ymm1, ymm0, ymm2 6527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm0, ymm2 6537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB 6547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB 6557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 6567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 6577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 6587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 6597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 6607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 6617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 6627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGB4444TOARGBROW_AVX2 6637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 24 instructions 6657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, 667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int pix) { 668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, 0x01080108 // generate multiplier to repeat 5 bits 670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, eax 
671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm5, xmm5, 0 672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits 673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm6, eax 674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm6, xmm6, 0 675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm3, 11 677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green 678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm4, 6 679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha 680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm7, 8 681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb1555 683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 8 pixels of 1555 690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian movdqa xmm2, xmm0 692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm1, 1 // R in upper 5 bits 693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm2, 11 // B in upper 5 bits 694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm3 695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm2, xmm5 // * (256 + 8) 696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm5 // * (256 + 8) 697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm1, 8 698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm2 // RB 699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm4 // G in middle 5 bits 701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm2, 8 // A 702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm6 // << 6 * (256 + 8) 703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm7 704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm2 // AG 705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm1 706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm0 707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm2, xmm0 7087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB 7097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB 710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 
712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 18 instructions. 7187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, 720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int pix) { 721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f 723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm4, eax 724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm4, xmm4, 0 725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles 726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 4 727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb4444 728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 
734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm4 // mask low nibbles 737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm5 // mask high nibbles 738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm1, 4 741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm3, 4 742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm1 743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm3 744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm2 746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm2 7477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB 7487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB 749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian 7567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, kShuffleMaskARGBToRGB24 763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 16 pixels of argb 766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB 771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm6 772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm2, xmm6 773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm3, xmm6 774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm1 // 4 bytes from 1 for 0 775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm1, 4 // 8 bytes from 1 
776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslldq xmm4, 12 // 4 bytes from 1 for 0 777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, xmm2 // 8 bytes from 2 for 1 778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 // 4 bytes from 1 for 0 779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslldq xmm5, 8 // 8 bytes from 2 for 1 780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 // store 0 781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm5 // 8 bytes from 2 for 1 782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm2, 8 // 4 bytes from 2 783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslldq xmm3, 4 // 12 bytes from 3 for 2 784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm3 // 12 bytes from 3 for 2 785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 // store 1 786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + 32], xmm2 // store 2 787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 48] 788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 7947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 
796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, kShuffleMaskARGBToRAW 801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 16 pixels of argb 804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB 809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm6 810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm2, xmm6 811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm3, xmm6 812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm1 // 4 bytes from 1 for 0 813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm1, 4 // 8 bytes from 1 814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslldq xmm4, 12 // 4 bytes from 1 for 0 815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, xmm2 // 8 bytes from 2 for 1 
816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 // 4 bytes from 1 for 0 817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslldq xmm5, 8 // 8 bytes from 2 for 1 818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 // store 0 819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm5 // 8 bytes from 2 for 1 820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm2, 8 // 4 bytes from 2 821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslldq xmm3, 4 // 12 bytes from 3 for 2 822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm3 // 12 bytes from 3 for 2 823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 // store 1 824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + 32], xmm2 // store 2 825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 48] 826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 8327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 pixels 8337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 
837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm3, xmm3 // generate mask 0x0000001f 840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm3, 27 841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm4, 26 843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm4, 5 844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 11 846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 8487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 4 pixels of argb 8497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm1, xmm0 // B 8507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm2, xmm0 // G 8517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pslld xmm0, 8 // R 8527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrld xmm1, 3 // B 8537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrld xmm2, 5 // G 8547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrad xmm0, 16 // R 8557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pand xmm1, xmm3 // B 8567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pand xmm2, xmm4 // G 8577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pand xmm0, xmm5 // R 
8587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian por xmm1, xmm2 // BG 8597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian por xmm0, xmm1 // BGR 8607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian packssdw xmm0, xmm0 8617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 16] 8627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 8637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 8] 8647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 8657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 8667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 8677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 8687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 8697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 8707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels 8717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 8727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, 8737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint32 dither4, int pix) { 8747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 8757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 8767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 8777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 8787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd xmm6, [esp + 12] // dither4 8797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 16] // pix 
8807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian punpcklbw xmm6, xmm6 // make dither 16 bytes 8817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, xmm6 8827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian punpcklwd xmm6, xmm6 8837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian punpckhwd xmm7, xmm7 8847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pcmpeqb xmm3, xmm3 // generate mask 0x0000001f 8857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrld xmm3, 27 8867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 8877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrld xmm4, 26 8887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pslld xmm4, 5 8897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 8907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pslld xmm5, 11 8917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 8927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 8937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 4 pixels of argb 8947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian paddusb xmm0, xmm6 // add dither 895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 // B 896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // G 897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm0, 8 // R 898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm1, 3 // B 899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm2, 5 // G 900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrad xmm0, 16 // R 
901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm3 // B 902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm4 // G 903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // R 904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm2 // BG 905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm1 // BGR 906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm0, xmm0 907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 9167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 9177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 9187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, 9197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint32 dither4, int pix) { 9207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 9217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 9227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, 
[esp + 8] // dst_rgb 9237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss xmm6, [esp + 12] // dither4 9247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 16] // pix 9257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes 9267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm6, ymm6, 0xd8 9277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd ymm6, ymm6, ymm6 9287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f 9297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm3, ymm3, 27 9307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 9317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm4, ymm4, 26 9327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm4, ymm4, 5 9337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 9347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 9357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 9367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 8 pixels of argb 9377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpaddusb ymm0, ymm0, ymm6 // add dither 9387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm2, ymm0, 5 // G 9397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm1, ymm0, 3 // B 9407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm0, ymm0, 8 // R 9417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm2, ymm2, ymm4 // G 
9427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm3 // B 9437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm5 // R 9447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm1, ymm1, ymm2 // BG 9457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm1 // BGR 9467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackusdw ymm0, ymm0, ymm0 9477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 9487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 9497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], xmm0 // store 8 pixels of RGB565 9507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 9517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 9527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 9537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 9547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 9557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 9567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 9577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGBTORGB565DITHERROW_AVX2 9587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Improve sign extension/packing. 
9607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0x0000001f 967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm4, 27 968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, xmm4 // generate mask 0x000003e0 969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 5 970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, xmm4 // generate mask 0x00007c00 971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm6, 10 972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm7, 15 974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 9767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 4 pixels of argb 977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 // B 978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // G 979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm0 // R 
980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrad xmm0, 16 // A 981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm1, 3 // B 982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm2, 6 // G 983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm3, 9 // R 984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm7 // A 985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm4 // B 986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm5 // G 987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm3, xmm6 // R 988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm1 // BA 989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm3 // GR 990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm2 // BGRA 991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm0, xmm0 992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 10017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanianvoid ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 1003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 1005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 1006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 1007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 1008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm4, 12 1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm4 // generate mask 0x00f000f0 1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm3, 8 1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 10137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // fetch 4 pixels of argb 1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm3 // low nibble 1016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm4 // high nibble 10177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrld xmm0, 4 10187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrld xmm1, 8 1019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm1 1020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 1021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 1022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 // store 4 pixels 
of ARGB4444 1023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 1024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 1025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 10307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTORGB565ROW_AVX2 10317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 10327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { 10337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 10347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 10357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 10367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 10377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f 10387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm3, ymm3, 27 10397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 10407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm4, ymm4, 26 10417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm4, ymm4, 5 10427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 10437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
10447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 10457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 8 pixels of argb 10467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm2, ymm0, 5 // G 10477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm1, ymm0, 3 // B 10487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm0, ymm0, 8 // R 10497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm2, ymm2, ymm4 // G 10507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm3 // B 10517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm5 // R 10527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm1, ymm1, ymm2 // BG 10537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm1 // BGR 10547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackusdw ymm0, ymm0, ymm0 10557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 10567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 10577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], xmm0 // store 8 pixels of RGB565 10587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 10597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 10607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 10617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 10627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 10637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 10647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 
10657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGBTORGB565ROW_AVX2 10667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 10677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTOARGB1555ROW_AVX2 10687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 10697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { 10707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 10717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 10727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 10737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 10747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 10757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f 10767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 10777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 10787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 10797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm7, ymm7, 15 10807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 10817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 10827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 8 pixels of argb 10837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm3, ymm0, 9 // R 10847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian vpsrld ymm2, ymm0, 6 // G 10857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm1, ymm0, 3 // B 10867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrad ymm0, ymm0, 16 // A 10877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm3, ymm3, ymm6 // R 10887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm2, ymm2, ymm5 // G 10897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm4 // B 10907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm7 // A 10917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm1 // BA 10927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm2, ymm2, ymm3 // GR 10937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm2 // BGRA 10947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackssdw ymm0, ymm0, ymm0 10957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 10967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 10977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 10987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 10997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 11007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 11017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 11027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 11037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 11047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 11057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // 
HAS_ARGBTOARGB1555ROW_AVX2 11067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 11077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTOARGB4444ROW_AVX2 11087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 11097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { 11107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 11117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 11127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_rgb 11137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 11147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 11157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsllw ymm4, ymm4, 12 11167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 11177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 11187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 11197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // fetch 8 pixels of argb 11207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm0, ymm4 // high nibble 11217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm3 // low nibble 11227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm1, ymm1, 8 11237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrld ymm0, ymm0, 4 11247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm1 11257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
vpackuswb ymm0, ymm0, ymm0 11267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 11277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 11287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 11297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 11307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 11317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 11327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 11337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 11347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 11357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 11367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGBTOARGB4444ROW_AVX2 11377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
11397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 1145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kARGBToY 11467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm5, kAddY16 1147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 11497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 11507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 11517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 11527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm4 1154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 1155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm4 1156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm4 1157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 1159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm2, xmm3 
1160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 1161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 7 1162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 1163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 11647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 11667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 11727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. 11737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
11747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 1180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kARGBToYJ 1181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddYJ64 1182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 11847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 11857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 11867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 11877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm4 1189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 1190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm4 1191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm4 1192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 1194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm2, xmm3 
1195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm5 // Add .5 for rounding. 1196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm2, xmm5 1197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 1198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 7 1199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 12007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 12027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBTOYROW_AVX2 12097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// vpermd for vphaddw + vpackuswb vpermd. 12107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic const lvec32 kPermdARGBToY_AVX = { 12117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 0, 4, 1, 5, 2, 6, 3, 7 12127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}; 12137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
12157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { 1217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 1221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm4, kARGBToY 1222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm5, kAddY16 12237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm6, kPermdARGBToY_AVX 1224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 1227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 1228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm2, [eax + 64] 1229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm3, [eax + 96] 1230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm0, ymm0, ymm4 1231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm1, ymm1, ymm4 1232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm2, ymm2, ymm4 1233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm3, ymm3, ymm4 1234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 128] 1235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vphaddw ymm0, ymm0, ymm1 // mutates. 1236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vphaddw ymm2, ymm2, ymm3 1237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 7 1238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm2, ymm2, 7 1239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm2 // mutates. 1240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 12417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpaddb ymm0, ymm0, ymm5 // add 16 for Y 1242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 1243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 12447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 1245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 1247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBTOYROW_AVX2 1251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 12527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTOYJROW_AVX2 1253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
12547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { 1256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 1260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm4, kARGBToYJ 1261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm5, kAddYJ64 12627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm6, kPermdARGBToY_AVX 1263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 1266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 1267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm2, [eax + 64] 1268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm3, [eax + 96] 1269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm0, ymm0, ymm4 1270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm1, ymm1, ymm4 1271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm2, ymm2, ymm4 1272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm3, ymm3, ymm4 1273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 128] 1274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vphaddw ymm0, ymm0, ymm1 // mutates. 1275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vphaddw ymm2, ymm2, ymm3 1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. 1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpaddw ymm2, ymm2, ymm5 1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 7 1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm2, ymm2, 7 1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm2 // mutates. 1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 12847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBTOYJROW_AVX2 1292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 12937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 12947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 
1296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 12997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm4, kBGRAToY 1300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddY16 1301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 1304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 1306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm4 1308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 1309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm4 1310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm4 1311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 1313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm2, xmm3 1314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 1315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 7 1316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 1317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb 
xmm0, xmm5 1318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 1319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 13207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 13267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 13277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 13327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm4, kABGRToY 13337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm5, kAddY16 1334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 1337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 
1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm4 1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm4 1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm4 1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm2, xmm3 1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 7 1349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 13507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian paddb xmm0, xmm5 1351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 1352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 13537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 13597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 13607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 1363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_y */ 1364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* pix */ 1365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kRGBAToY 1366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddY16 1367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 1370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 1372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm4 1374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 1375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm4 1376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm4 1377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 1379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm2, xmm3 1380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 1381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 7 1382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 1383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 
1384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 1385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 13867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 13927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 1399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride_argb 1400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 1401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 1402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 1403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUV128 14047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kARGBToV 14057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian movdqa xmm7, kARGBToU 1406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 16x2 argb pixels to 8x1 */ 14107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 14117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi] 14127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm0, xmm4 14137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 14147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 16] 14157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm1, xmm4 14167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 14177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 32] 14187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm2, xmm4 14197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 14207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 48] 14217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm3, xmm4 14227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 1425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 1426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm1, 0xdd 
1427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 1428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 1429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm3, 0x88 1430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm3, 0xdd 1431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 1432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert to U and V 1434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 16 different pixels, its 8 pixels of U and 8 of V 1436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 1438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 // U 1439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 1440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 // V 1441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 1442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm2 1443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm1, xmm3 1444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 1445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 8 1446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm1 1447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 // -> unsigned 
1448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 8 U and 8 V values 1450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlps qword ptr [edx], xmm0 // U 1451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps qword ptr [edx + edi], xmm0 // V 1452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 14537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 14627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 1469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride_argb 
1470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 1471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 1472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 1473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUVJ128 14747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kARGBToVJ 14757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, kARGBToUJ 1476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 16x2 argb pixels to 8x1 */ 14807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 14817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi] 14827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm0, xmm4 14837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 14847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 16] 14857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm1, xmm4 14867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 14877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 32] 14887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm2, xmm4 14897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 14907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax 
+ esi + 48] 14917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm3, xmm4 14927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 1495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 1496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm1, 0xdd 1497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 1498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 1499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm3, 0x88 1500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm3, 0xdd 1501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 1502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert to U and V 1504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 16 different pixels, its 8 pixels of U and 8 of V 1506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 1508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 // U 1509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 1510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 // V 1511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 
1512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm2 1513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm1, xmm3 1514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm5 // +.5 rounding -> unsigned 1515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm1, xmm5 1516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 1517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 8 1518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm1 1519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 8 U and 8 V values 1521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlps qword ptr [edx], xmm0 // U 1522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps qword ptr [edx + edi], xmm0 // V 1523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 15247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBTOUVROW_AVX2 15347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian__declspec(naked) 1535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, 1536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 1541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride_argb 1542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 1543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 1544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 1545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm5, kAddUV128 1546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm6, kARGBToV 1547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm7, kARGBToU 1548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 32x2 argb pixels to 16x1 */ 1552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 1553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 1554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
vmovdqu ymm2, [eax + 64] 1555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm3, [eax + 96] 1556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm0, ymm0, [eax + esi] 1557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm1, ymm1, [eax + esi + 32] 1558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm2, ymm2, [eax + esi + 64] 1559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm3, ymm3, [eax + esi + 96] 1560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 128] 1561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vshufps ymm4, ymm0, ymm1, 0x88 1562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vshufps ymm0, ymm0, ymm1, 0xdd 1563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm0, ymm0, ymm4 // mutated by vshufps 1564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vshufps ymm4, ymm2, ymm3, 0x88 1565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vshufps ymm2, ymm2, ymm3, 0xdd 1566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert to U and V 1569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 32 different pixels, its 16 pixels of U and 16 of V 1571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm1, ymm0, ymm7 // U 1572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm3, ymm2, ymm7 1573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw 
ymm0, ymm0, ymm6 // V 1574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmaddubsw ymm2, ymm2, ymm6 1575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vphaddw ymm1, ymm1, ymm3 // mutates 1576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vphaddw ymm0, ymm0, ymm2 1577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsraw ymm1, ymm1, 8 1578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsraw ymm0, ymm0, 8 1579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpacksswb ymm0, ymm1, ymm0 // mutates 1580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // For vpacksswb 1581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw 1582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpaddb ymm0, ymm0, ymm5 // -> unsigned 1583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 16 U and 16 V values 1585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vextractf128 [edx], ymm0, 0 // U 1586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vextractf128 [edx + edi], ymm0, 1 // V 1587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 15887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 1589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 1594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian ret 1595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBTOUVROW_AVX2 1598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 15997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 16007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToUV444Row_SSSE3(const uint8* src_argb0, 16017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 16047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb 16057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 16067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 16077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 1608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUV128 16097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kARGBToV 16107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, kARGBToU 1611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 16147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* convert to U and V */ 16157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // U 
1616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 1618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 16197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 16207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pmaddubsw xmm1, xmm7 1621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 16227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pmaddubsw xmm3, xmm7 16237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian phaddw xmm0, xmm1 16247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian phaddw xmm2, xmm3 1625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 16267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psraw xmm2, 8 16277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian packsswb xmm0, xmm2 16287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian paddb xmm0, xmm5 16297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // V 1632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 1634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm6 1636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 1637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm6 
1638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 1639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 1640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm2, xmm3 1641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 1642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm2, 8 1643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm2 1644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 1645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + edi], xmm0 1647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 16487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 16567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV422Row_SSSE3(const uint8* src_argb0, 1658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 
1661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb 1662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 1663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 1664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 1665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUV128 16667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kARGBToV 16677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, kARGBToU 1668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 16x2 argb pixels to 8x1 */ 16727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 16737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 16747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 16757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 1678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 1679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm1, 0xdd 1680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 1681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 
1682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm3, 0x88 1683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm3, 0xdd 1684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 1685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert to U and V 1687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 16 different pixels, its 8 pixels of U and 8 of V 1689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 1691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 // U 1692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 1693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 // V 1694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 1695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm2 1696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm1, xmm3 1697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 1698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 8 1699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm1 1700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 // -> unsigned 1701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 8 U and 8 V values 
1703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlps qword ptr [edx], xmm0 // U 1704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps qword ptr [edx + edi], xmm0 // V 1705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 17067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 17147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 17157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 17167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 17187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push esi 1719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 17207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 17217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride_argb 17227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 17237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 17247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, 
[esp + 8 + 20] // pix 1725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUV128 17267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kBGRAToV 17277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, kBGRAToU 1728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 17337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi] 17347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm0, xmm4 1735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 17367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 16] 17377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm1, xmm4 1738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 17397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 32] 17407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm2, xmm4 1741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 17427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 48] 17437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm3, xmm4 17447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 
1746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 1747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 1748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm1, 0xdd 1749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 1750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 1751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm3, 0x88 1752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm3, 0xdd 1753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 1754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert to U and V 1756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 16 different pixels, its 8 pixels of U and 8 of V 1758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 1760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 // U 1761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 1762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 // V 1763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 1764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm2 1765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm1, xmm3 1766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 
1767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 8 1768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm1 1769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 // -> unsigned 1770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 8 U and 8 V values 1772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlps qword ptr [edx], xmm0 // U 1773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps qword ptr [edx + edi], xmm0 // V 1774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 17757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 17797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop esi 1780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 17847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 17857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 
1789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 1791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride_argb 1792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 1793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 1794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 1795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUV128 17967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kABGRToV 17977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, kABGRToU 1798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 16x2 argb pixels to 8x1 */ 18027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 18037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi] 18047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm0, xmm4 18057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 18067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 16] 18077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm1, xmm4 18087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 18097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
movdqu xmm4, [eax + esi + 32] 18107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm2, xmm4 18117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 18127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 48] 18137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm3, xmm4 18147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 1817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 1818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm1, 0xdd 1819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 1820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 1821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm3, 0x88 1822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm3, 0xdd 1823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 1824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert to U and V 1826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 16 different pixels, its 8 pixels of U and 8 of V 1828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 1830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 
// U 1831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 1832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 // V 1833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 1834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm2 1835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm1, xmm3 1836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 1837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 8 1838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm1 1839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 // -> unsigned 1840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 8 U and 8 V values 1842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlps qword ptr [edx], xmm0 // U 1843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps qword ptr [edx + edi], xmm0 // V 1844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 18457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 1847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian 18547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 18557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 18567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int width) { 1857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 1861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride_argb 1862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 1863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 1864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 1865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddUV128 18667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm6, kRGBAToV 18677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm7, kRGBAToU 1868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx // stride from u to v 1869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 1871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 1873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm4, [eax + esi] 
1874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 18757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 16] 1877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm4 18787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 1879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 32] 1880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 18817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 1882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm4, [eax + esi + 48] 1883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm3, xmm4 18847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 1885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 1886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 1887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 1888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm1, 0xdd 1889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm4 1890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 1891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm3, 0x88 1892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm4, xmm3, 0xdd 1893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 1894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 2 - convert 
to U and V 1896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // from here down is very similar to Y code except 1897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // instead of 16 different pixels, its 8 pixels of U and 8 of V 1898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 1899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 1900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm7 // U 1901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm2, xmm7 1902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm6 // V 1903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm3, xmm6 1904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm2 1905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm1, xmm3 1906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 8 1907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 8 1908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packsswb xmm0, xmm1 1909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddb xmm0, xmm5 // -> unsigned 1910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // step 3 - store 8 U and 8 V values 1912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlps qword ptr [edx], xmm0 // U 1913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps qword ptr [edx + edi], xmm0 // V 1914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 19157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 1916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian jg convertloop 1917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 19237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGBTOYROW_SSSE3 1924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 19257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 16 UV from 444 19267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READYUV444_AVX2 __asm { \ 19277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ 19287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ 19297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea esi, [esi + 16] \ 19307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm0, ymm0, 0xd8 \ 19317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm1, ymm1, 0xd8 \ 19327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 19337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 19347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 19357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 8 UV from 422, upsample to 16 UV. 
19367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READYUV422_AVX2 __asm { \ 19377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ 19387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ 19397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea esi, [esi + 8] \ 19407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 19417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm0, ymm0, 0xd8 \ 19427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 19437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 19447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 19457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 4 UV from 411, upsample to 16 UV. 
19467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READYUV411_AVX2 __asm { \ 19477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ 19487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ 19497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea esi, [esi + 4] \ 19507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 19517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 19527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm0, ymm0, 0xd8 \ 19537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ 19547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 19557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 19567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 8 UV from NV12, upsample to 16 UV. 19577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READNV12_AVX2 __asm { \ 19587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu xmm0, [esi] /* UV */ \ 19597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea esi, [esi + 16] \ 19607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm0, ymm0, 0xd8 \ 19617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 19627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 19637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 19647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Convert 16 pixels: 16 UV and 16 Y. 
19657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define YUVTORGB_AVX2(YuvConstants) __asm { \ 19667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ 19677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ 19687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ 19697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ 19707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu ymm3, YuvConstants.kUVBiasR \ 19717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpsubw ymm2, ymm3, ymm2 \ 19727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu ymm3, YuvConstants.kUVBiasG \ 19737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpsubw ymm1, ymm3, ymm1 \ 19747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu ymm3, YuvConstants.kUVBiasB \ 19757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpsubw ymm0, ymm3, ymm0 \ 19767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 2: Find Y contribution to 16 R,G,B values */ \ 19777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu xmm3, [eax] /* NOLINT */ \ 19787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea eax, [eax + 16] \ 19797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm3, ymm3, 0xd8 \ 19807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklbw ymm3, ymm3, ymm3 \ 19817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ 
19827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ 19837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ 19847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ 19857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpsraw ymm0, ymm0, 6 \ 19867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpsraw ymm1, ymm1, 6 \ 19877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpsraw ymm2, ymm2, 6 \ 19887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ 19897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ 19907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ 19917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 19927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 19937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 16 ARGB values. 
19947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREARGB_AVX2 __asm { \ 19957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into ARGB */ \ 19967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ 19977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm0, ymm0, 0xd8 \ 19987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ 19997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpermq ymm2, ymm2, 0xd8 \ 20007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ 20017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ 20027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu 0[edx], ymm1 \ 20037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm vmovdqu 32[edx], ymm0 \ 20047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 64] \ 20057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 20067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 20077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TOARGBROW_AVX2 20087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels 20097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
20107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 20117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToARGBRow_AVX2(const uint8* y_buf, 20127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 20137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 20147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 20157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 20197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 20207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 20217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 20227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 20237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 20247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 20257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 20287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV422_AVX2 20297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 20307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB_AVX2 
2031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 2033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 20377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 2038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 20417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I422TOARGBROW_AVX2 2042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 20437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_J422TOARGBROW_AVX2 20447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels 20457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
20467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 20477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J422ToARGBRow_AVX2(const uint8* y_buf, 20487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 20497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 20507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 20517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 20557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 20567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 20577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 20587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 20597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 20607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 20617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 20647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV422_AVX2 20657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvJConstants) 20667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB_AVX2 
2067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 20687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 20697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 20707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 20717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop edi 20727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop esi 20737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 20747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 20757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 20767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 20777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_J422TOARGBROW_AVX2 20787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 20797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I444TOARGBROW_AVX2 20807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels 20817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
20827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 20837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I444ToARGBRow_AVX2(const uint8* y_buf, 20847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 20857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 20867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 20877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 20887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 20897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push esi 20907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push edi 20917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 20927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 20937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 20947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 20957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 20967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 20977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 20987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 20997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 21007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV444_AVX2 21017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 21027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB_AVX2 
2103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 2105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 21097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 2110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 21137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I444TOARGBROW_AVX2 2114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 21157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I411TOARGBROW_AVX2 21167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels 21177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
21187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 21197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I411ToARGBRow_AVX2(const uint8* y_buf, 21207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 21217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 21227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 21237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 21277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 21287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 21297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 21307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 21317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 21327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 21337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 21367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV411_AVX2 21377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 21387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB_AVX2 
2139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 21407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 21417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 21427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 21437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop edi 21447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop esi 21457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 21467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 21477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 21487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 21497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I411TOARGBROW_AVX2 21507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 21517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_NV12TOARGBROW_AVX2 21527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels. 21537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
21547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 21557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV12ToARGBRow_AVX2(const uint8* y_buf, 21567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* uv_buf, 21577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 21587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 21597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 21607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push esi 21617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // Y 21627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // UV 21637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // argb 21647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 21657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 21667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 21677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 21687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READNV12_AVX2 21697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 21707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB_AVX2 2171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 21737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 21747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 21757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop esi 
21767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 21777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 21787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 21797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 21807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_NV12TOARGBROW_AVX2 21817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 21827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_NV21TOARGBROW_AVX2 21837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels. 21847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). 21857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 21867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV21ToARGBRow_AVX2(const uint8* y_buf, 21877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* uv_buf, 21887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 21897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 21907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 21917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push esi 21927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // Y 21937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // UV 21947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // argb 21957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 21967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 
21977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 21987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 21997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READNV12_AVX2 22007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYvuConstants) 22017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB_AVX2 22027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 22037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 22047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 22057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 22067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop esi 22077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 22087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 22097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 22107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 22117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_NV21TOARGBROW_AVX2 22127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 22137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TOBGRAROW_AVX2 22147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels 22157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 22167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 
22177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 22187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToBGRARow_AVX2(const uint8* y_buf, 22197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 22207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 22217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 22227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 22237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 22247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push esi 22257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push edi 22267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 22277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 22287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 22297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 22307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 22317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 22327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 22337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 22347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 22357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV422_AVX2 22367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 22377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 22387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian // Step 3: Weave into BGRA 22397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm1, ymm1, ymm0 // GB 22407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 22417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm2, ymm5, ymm2 // AR 22427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm2, ymm2, 0xd8 22437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels 22447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels 22457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 22467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm2 22477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 64] 22487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 2249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 22537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 2254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 22577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I422TOBGRAROW_AVX2 2258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 22597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TORGBAROW_AVX2 22607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian// 16 pixels 22617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 22627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 22637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 22647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToRGBARow_AVX2(const uint8* y_buf, 22657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 22667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 22677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 22687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 22727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 22737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 22747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 22757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 22767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 22777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 22787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 
22817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV422_AVX2 22827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 2283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 22847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Step 3: Weave into RGBA 22857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm1, ymm1, ymm2 // GR 22867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 22877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm2, ymm5, ymm0 // AB 22887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm2, ymm2, 0xd8 22897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels 22907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels 22917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 22927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 22937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 64] 2294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 2295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 22997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 2300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 
23037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I422TORGBAROW_AVX2 2304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 23057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TOABGRROW_AVX2 2306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 16 pixels 23077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 23087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 23097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 23107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToABGRRow_AVX2(const uint8* y_buf, 2311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, 2314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 2322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian sub edi, esi 2324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 23277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian READYUV422_AVX2 23287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB_AVX2(kYuvConstants) 23297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 23307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Step 3: Weave into ABGR 23317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm1, ymm2, ymm1 // RG 23327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 23337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm2, ymm0, ymm5 // BA 2334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm2, ymm2, 0xd8 23357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels 23367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels 23377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 23387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 2339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 2340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 2341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 
23457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 2346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 23497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I422TOABGRROW_AVX2 2350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 23517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if defined(HAS_I422TOARGBROW_SSSE3) 2352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. 2353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Read 8 UV from 444. 2355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define READYUV444 __asm { \ 2356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ 2357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ 2358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm lea esi, [esi + 8] \ 2359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm punpcklbw xmm0, xmm1 /* UV */ \ 2360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Read 4 UV from 422, upsample to 8 UV. 
// READYUV422: loads 4 bytes from [esi] (U) and 4 from [esi + edi] (V),
// interleaves them, duplicates each UV pair into xmm0, and advances esi
// by 4.  Clobbers xmm1.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi] /* U */                                       \
    __asm movd       xmm1, [esi + edi] /* V */                                 \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1 /* UV */                                       \
    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                          \
  }

// Read 2 UV from 411, upsample to 8 UV.
// Loads 2 bytes from [esi] (U) and 2 from [esi + edi] (V) through ebx,
// replicates each UV pair four times into xmm0, and advances esi by 2.
// Clobbers ebx and xmm1, so the enclosing function must preserve ebx.
#define READYUV411 __asm {                                                     \
    __asm movzx      ebx, word ptr [esi] /* U */                  /* NOLINT */ \
    __asm movd       xmm0, ebx                                                 \
    __asm movzx      ebx, word ptr [esi + edi] /* V */            /* NOLINT */ \
    __asm movd       xmm1, ebx                                                 \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1 /* UV */                                       \
    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                          \
    __asm punpckldq  xmm0, xmm0 /* UVUVUVUV (upsample) */                      \
  }
// Read 4 UV from NV12, upsample to 8 UV.
// Loads 8 interleaved chroma bytes from [esi], duplicates each 16-bit pair
// into xmm0, and advances esi by 8.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                          \
  }

// Convert 8 pixels: 8 UV and 8 Y.
// Input:  xmm0 = 8 interleaved UV pairs, eax = Y pointer (advanced by 8).
// Output: xmm0 = B, xmm1 = G, xmm2 = R, each packed to unsigned bytes.
// Clobbers xmm3.  The argument must name an object with the kUVToB/G/R,
// kUVBiasB/G/R and kYToRgb members used below.
#define YUVTORGB(YuvConstants) __asm {                                         \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm movdqa     xmm3, xmm0                                                \
    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
    __asm psubw      xmm0, xmm1                                                \
    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
    __asm psubw      xmm1, xmm2                                                \
    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
    __asm psubw      xmm2, xmm3                                                \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm3                                                \
    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }

// Store 8 ARGB values.
// Input: xmm0 = B, xmm1 = G, xmm2 = R, xmm5 = alpha bytes.  Writes 32 bytes
// at [edx] in B,G,R,A byte order and advances edx by 32.
#define STOREARGB __asm {                                                      \
    /* Step 3: Weave into ARGB */                                              \
    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
    __asm movdqa     xmm1, xmm0                                                \
    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
    __asm movdqu     0[edx], xmm0                                              \
    __asm movdqu     16[edx], xmm1                                             \
    __asm lea        edx,  [edx + 32]                                          \
  }

// Store 8 BGRA values.
24357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREBGRA __asm { \ 24367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into BGRA */ \ 24377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 24387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm1, xmm0 /* GB */ \ 24397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm5, xmm2 /* AR */ \ 24407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm0, xmm5 \ 24417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ 24427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ 24437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 0[edx], xmm5 \ 24447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 16[edx], xmm0 \ 24457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 32] \ 24467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 24477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 24487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 ABGR values. 
24497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREABGR __asm { \ 24507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into ABGR */ \ 24517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm2, xmm1 /* RG */ \ 24527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm0, xmm5 /* BA */ \ 24537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm1, xmm2 \ 24547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ 24557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ 24567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 0[edx], xmm2 \ 24577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 16[edx], xmm1 \ 24587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 32] \ 24597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 24607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 24617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RGBA values. 
24627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERGBA __asm { \ 24637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into RGBA */ \ 24647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 24657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm1, xmm2 /* GR */ \ 24667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm5, xmm0 /* AB */ \ 24677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm0, xmm5 \ 24687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ 24697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ 24707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 0[edx], xmm5 \ 24717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 16[edx], xmm0 \ 24727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 32] \ 24737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 24747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 24757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RGB24 values. 
24767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERGB24 __asm { \ 24777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into RRGB */ \ 24787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm0, xmm1 /* BG */ \ 24797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm2, xmm2 /* RR */ \ 24807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm1, xmm0 \ 24817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 24827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 24837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 4: RRGB -> RGB24 */ \ 24847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ 24857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ 24867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ 24877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ 24887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ 24897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 24] \ 24907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 24917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 24927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RAW values. 
24937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERAW __asm { \ 24947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into RRGB */ \ 24957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm0, xmm1 /* BG */ \ 24967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm2, xmm2 /* RR */ \ 24977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm1, xmm0 \ 24987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 24997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 25007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 4: RRGB -> RAW */ \ 25017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ 25027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ 25037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ 25047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ 25057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ 25067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 24] \ 25077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 25087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 25097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RGB565 values. 
25107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERGB565 __asm { \ 25117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 3: Weave into RRGB */ \ 25127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm0, xmm1 /* BG */ \ 25137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklbw xmm2, xmm2 /* RR */ \ 25147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm1, xmm0 \ 25157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 25167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 25177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian /* Step 4: RRGB -> RGB565 */ \ 25187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ 25197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm2, xmm0 /* G */ \ 25207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pslld xmm0, 8 /* R */ \ 25217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm psrld xmm3, 3 /* B */ \ 25227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm psrld xmm2, 5 /* G */ \ 25237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm psrad xmm0, 16 /* R */ \ 25247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pand xmm3, xmm5 /* B */ \ 25257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pand xmm2, xmm6 /* G */ \ 25267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pand xmm0, xmm7 /* R */ \ 25277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm por xmm3, xmm2 /* BG */ \ 25287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm 
por xmm0, xmm3 /* BGR */ \ 25297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ 25307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqa xmm2, xmm1 /* G */ \ 25317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pslld xmm1, 8 /* R */ \ 25327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm psrld xmm3, 3 /* B */ \ 25337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm psrld xmm2, 5 /* G */ \ 25347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm psrad xmm1, 16 /* R */ \ 25357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pand xmm3, xmm5 /* B */ \ 25367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pand xmm2, xmm6 /* G */ \ 25377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm pand xmm1, xmm7 /* R */ \ 25387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm por xmm3, xmm2 /* BG */ \ 25397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm por xmm1, xmm3 /* BGR */ \ 25407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm packssdw xmm0, xmm1 \ 25417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ 25427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm lea edx, [edx + 16] \ 25437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 25447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 25457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 2546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
25477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I444ToARGBRow_SSSE3(const uint8* y_buf, 2549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, 2552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 2560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV444 25667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 25677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB 2568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
2569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 25787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 25797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 25807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB24Row_SSSE3(const uint8* y_buf, 2582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_rgb24, 2585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, 
[esp + 8 + 12] // V 2592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // rgb24 2593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuffleMaskARGBToRGB24_0 2596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, kShuffleMaskARGBToRGB24 2597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 26007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 26017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STORERGB24 2602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 26127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 26137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). 
26147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRAWRow_SSSE3(const uint8* y_buf, 2616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_raw, 2619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // raw 2627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuffleMaskARGBToRAW_0 2630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, kShuffleMaskARGBToRAW 2631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 26347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 26357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
STORERAW 2636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 26467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels 26477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). 26487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB565Row_SSSE3(const uint8* y_buf, 2650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* rgb565_buf, 2653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 
2659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // rgb565 2661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x0000001f 2664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm5, 27 2665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 2666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm6, 26 2667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm6, 5 2668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 2669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm7, 11 2670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 26737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 26747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STORERGB565 2675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 
2681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 26857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 2686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 26877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGBRow_SSSE3(const uint8* y_buf, 2689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, 2692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 2700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // 
generate 0xffffffff for alpha 2703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 27067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 27077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB 2708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 27187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 27197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// JPeg color space version of I422ToARGB 27207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
27217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 27227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J422ToARGBRow_SSSE3(const uint8* y_buf, 2723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, 2726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // argb 2734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 27407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvJConstants) 27417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB 27427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
2743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 27527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 2753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Similar to I420 but duplicate UV once more. 
27557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 27567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I411ToARGBRow_SSSE3(const uint8* y_buf, 27577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* u_buf, 27587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* v_buf, 27597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 27607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push ebx 2763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 12 + 4] // Y 2766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 12 + 8] // U 2767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 12 + 12] // V 2768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12 + 16] // argb 2769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12 + 20] // width 2770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 27717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV411 // modifies EBX 27757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 27767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian STOREARGB 2777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop ebx 2784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 27887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 2789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
27907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 27917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV12ToARGBRow_SSSE3(const uint8* y_buf, 27927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* uv_buf, 27937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 27947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // Y 2798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // UV 2799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // argb 2800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 2801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READNV12 28057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 28067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB 2807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 
2812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 28167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels. 28177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). 28187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 28197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV21ToARGBRow_SSSE3(const uint8* y_buf, 28207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian const uint8* uv_buf, 28217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_argb, 28227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // Y 28267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // UV 2827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // argb 2828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 2829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READNV12 28337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
YUVTORGB(kYvuConstants) 28347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREARGB 2835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 28447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToBGRARow_SSSE3(const uint8* y_buf, 2846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_bgra, 2849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // bgra 
2857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 28627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 28637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREBGRA 2864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 28747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToABGRRow_SSSE3(const uint8* y_buf, 2876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_abgr, 2879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 
2881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // abgr 2887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 28937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 28947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STOREABGR 2895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 
2904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 29057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGBARow_SSSE3(const uint8* y_buf, 2907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* u_buf, 2908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* v_buf, 2909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_rgba, 2910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 2911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 2913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 2914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // Y 2915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // U 2916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // V 2917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // rgba 2918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 2919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 2920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian READYUV422 29237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian YUVTORGB(kYuvConstants) 29247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian STORERGBA 2925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian sub ecx, 8 2927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 2928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 2930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 2931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 2932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 2933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 2934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_I422TOARGBROW_SSSE3 2936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 29377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I400TOARGBROW_SSE2 29387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 
29397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 29407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I400ToARGBRow_SSE2(const uint8* y_buf, 29417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* rgb_buf, 29427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 2943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 29447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 2945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, eax 2946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm2, xmm2,0 29477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 29487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd xmm3, eax 29497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pshufd xmm3, xmm3, 0 29507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0xff000000 29517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pslld xmm4, 24 29527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // Y 2954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // rgb 2955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 2956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 2958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 2959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [eax] 2960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 29617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // Y.Y 29627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pmulhuw xmm0, xmm2 2963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubusw xmm0, xmm3 2964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 6 2965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // G 2966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Step 2: Weave into ARGB 2968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // GG 2969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 2970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm0 // BGRA first 4 pixels 2971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm1, xmm1 // BGRA next 4 pixels 2972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 2973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm4 29747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 29757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 2976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 2977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 2978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 29797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 29807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian } 29817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 29827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I400TOARGBROW_SSE2 29837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 29847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I400TOARGBROW_AVX2 29857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). 29867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// note: vpunpcklbw mutates and vpackuswb unmutates. 29877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 29887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I400ToARGBRow_AVX2(const uint8* y_buf, 29897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* rgb_buf, 29907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int width) { 29917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 29927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 29937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm2, eax 29947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm2, xmm2 29957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 29967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm3, eax 29977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastss ymm3, xmm3 29987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 29997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpslld ymm4, ymm4, 24 30007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 
30017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // Y 30027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // rgb 30037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 3004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 30057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 30067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 30077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu xmm0, [eax] 30087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 16] 30097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates 30107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm0, ymm0 // Y.Y 30117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm2 30127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsubusw ymm0, ymm0, ymm3 30137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 6 30147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 30157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 30167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // TODO(fbarchard): Weave alpha with unpack. 
30177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Step 2: Weave into ARGB 30187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates 30197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 30207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels 30217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels 30227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm4 30237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpor ymm1, ymm1, ymm4 30247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 30257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 30267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 64] 30277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 30287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 30297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 3030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 30337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_I400TOARGBROW_AVX2 3034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_SSSE3 3036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for reversing the bytes. 
3037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMirror = { 3038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 3040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 30417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Replace lea with -16 offset. 30427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 3044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 3048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuffleMirror 3049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 30517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax - 16 + ecx] 3052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm5 30537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 3054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 30557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 3056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 
3058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_MIRRORROW_SSSE3 3061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_AVX2 30637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 30697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastf128 ymm5, kShuffleMirror 3070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 30727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax - 32 + ecx] 3073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm0, ymm0, ymm5 3074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0x4e // swap high and low halfs 3075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 3076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 30777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vzeroupper 3080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_MIRRORROW_AVX2 3084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_SSE2 30867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 3092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 30947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax - 16 + ecx] 3095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 // swap bytes 3096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm0, 8 3097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 3098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm1 3099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm0, xmm0, 0x1b // swap words 3100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm0, xmm0, 0x1b 3101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian pshufd xmm0, xmm0, 0x4e // swap qwords 3102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 3103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 31047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 3105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_MIRRORROW_SSE2 3110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_UV_SSSE3 3112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for reversing the bytes of UV channels. 
3113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMirrorUV = { 3114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 3115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 3116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 31177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 3119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 3120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src 3123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 3124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 3126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, kShuffleMirrorUV 3127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + ecx * 2 - 16] 3128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 31317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 3132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax - 16] 3133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm1 
3134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlpd qword ptr [edx], xmm0 3135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhpd qword ptr [edx + edi], xmm0 3136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 31377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 3138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_MIRRORROW_UV_SSSE3 3145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 31467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBMIRRORROW_SSE2 31477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 31487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 3153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 
3154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 31567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 3157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax - 16] 31587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x1b 31597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 3160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 31617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 3162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 31667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGBMIRRORROW_SSE2 3167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBMIRRORROW_AVX2 3169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for reversing the bytes. 
3170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const ulvec32 kARGBShuffleMirror_AVX2 = { 3171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 3173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 31747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 31807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm5, kARGBShuffleMirror_AVX2 3181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 31837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order 3184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 3185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 31867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 3187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian} 3192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBMIRRORROW_AVX2 3193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SPLITUVROW_SSE2 31957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_uv 3200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 3201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 3203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 3205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 3209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 3211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 3212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm1 
3213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // even bytes 3214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 3215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 3216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 8 // odd bytes 3217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm3, 8 3218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm2, xmm3 3219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 3220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + edi], xmm2 3221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 3222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 32297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 3230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SPLITUVROW_SSE2 3231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SPLITUVROW_AVX2 32337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian __asm { 3236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_uv 3238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 3239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 3241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm5, ymm5, 8 3243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 3247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 3248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 3249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm2, ymm0, 8 // odd bytes 3250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm3, ymm1, 8 3251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm5 // even bytes 3252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm5 3253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 3254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm2, ymm2, ymm3 3255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 3256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vpermq ymm2, ymm2, 0xd8 3257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 3258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx + edi], ymm2 3259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 3260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 32 3261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SPLITUVROW_AVX2 3269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MERGEUVROW_SSE2 32717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 3274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_u 3277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // src_v 3278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_uv 
3279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 3280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 3281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 16 U's 3284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + edx] // and 16 V's 3285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 3286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 3287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm1 // first 8 UV pairs 3288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm2, xmm1 // next 8 UV pairs 3289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi], xmm0 3290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi + 16], xmm2 3291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 32] 3292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_MERGEUVROW_SSE2 3300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian#ifdef HAS_MERGEUVROW_AVX2 33027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 3305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_u 3308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // src_v 3309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_uv 3310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 3311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 3312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // read 32 U's 3315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + edx] // and 32 V's 3316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 3317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 3318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 33197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edi], ymm2, 0 // bytes 0..15 33207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 33217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 33227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 3323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 64] 3324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 32 3325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_MERGEUVROW_AVX2 3333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_COPYROW_SSE2 3335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. 
33367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 3338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 3342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 33447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 33457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 33477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 33487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 3349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 3350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 32 3351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_COPYROW_SSE2 3356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 33577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_COPYROW_AVX 
33587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. 33597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 33607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid CopyRow_AVX(const uint8* src, uint8* dst, int count) { 3361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 33627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src 33637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 33657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 33667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 33677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 33687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 33697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 33707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 33717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 33727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 64] 33737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 64 33747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg convertloop 33757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 33767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 3377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 
33807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_COPYROW_AVX 3381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 33827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Multiple of 1. 33837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 33847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid CopyRow_ERMS(const uint8* src, uint8* dst, int count) { 3385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, esi 3387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, edi 3388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4] // src 3389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8] // dst 3390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 33917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian rep movsb 3392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, edx 3393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, eax 3394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYALPHAROW_SSE2 3399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels 34007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 
3402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 3406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm0, 24 3408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm1, 8 3410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 34127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax] 34137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 16] 3414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 34157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [edx] 34167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [edx + 16] 3417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm0 3418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm3, xmm0 3419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm4, xmm1 3420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm5, xmm1 3421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm4 3422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm3, xmm5 34237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu 
[edx], xmm2 34247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm3 3425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 3426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 3427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBCOPYALPHAROW_SSE2 3433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYALPHAROW_AVX2 3435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels 34367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 3442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm0, ymm0, ymm0 3443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 3444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 
3446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax] 3447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm2, [eax + 32] 3448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 3449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpblendvb ymm1, ymm1, [edx], ymm0 3450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpblendvb ymm2, ymm2, [edx + 32], ymm0 3451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm1 3452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm2 3453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 3454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBCOPYALPHAROW_AVX2 3462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 3464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels 34657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 3467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 
3468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 3471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm0, 24 3473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm1, 8 3475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [eax] // 8 Y's 3478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 3479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm2 3480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm3, xmm2 3481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm2, xmm2 34827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [edx] 34837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [edx + 16] 3484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm0 3485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm3, xmm0 3486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm4, xmm1 3487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm5, xmm1 3488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm4 3489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian por xmm3, xmm5 34907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm2 34917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm3 3492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 3493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 3494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 3500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 3502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels 35037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src 3507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst 3508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 3509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm0, ymm0, ymm0 3510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 
3511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmovzxbd ymm1, qword ptr [eax] 3514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmovzxbd ymm2, qword ptr [eax + 8] 3515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 3516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpslld ymm1, ymm1, 24 3517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpslld ymm2, ymm2, 24 3518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpblendvb ymm1, ymm1, [edx], ymm0 3519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpblendvb ymm2, ymm2, [edx + 32], ymm0 3520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm1 3521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm2 3522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 3523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 3531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SETROW_X86 
35337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Write 'count' bytes using an 8 bit value repeated. 35347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Count should be multiple of 4. 35357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 35367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid SetRow_X86(uint8* dst, uint8 v8, int count) { 35377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 35387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movzx eax, byte ptr [esp + 8] // v8 35397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, 0x01010101 // Duplicate byte to all bytes. 35407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mul edx // overwrites edx with upper part of result. 3541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, edi 3542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4] // dst 3543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 3544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shr ecx, 2 3545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian rep stosd 3546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, edx 3547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 35517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Write 'count' bytes using an 8 bit value repeated. 
35527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 35537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid SetRow_ERMS(uint8* dst, uint8 v8, int count) { 3554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 35557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, edi 35567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 4] // dst 35577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8] // v8 35587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 35597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian rep stosb 35607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, edx 35617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 35627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 35637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 3564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 35657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Write 'count' 32 bit values. 
35667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 35677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { 35687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 35697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, edi 35707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, [esp + 4] // dst 35717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 8] // v32 35727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // count 35737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian rep stosd 35747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edi, edx 3575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SETROW_X86 3579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_YUY2TOYROW_AVX2 35817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToYRow_AVX2(const uint8* src_yuy2, 3583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_y, int pix) { 3584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_yuy2 3586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_y 3587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // 
pix 3588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm5, ymm5, 8 3590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 3593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 3594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 3595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm5 // even bytes are Y 3596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm5 3597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // mutates. 3598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 3599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 3600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 36017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 36087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 3609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUVRow_AVX2(const uint8* src_yuy2, int 
stride_yuy2, 3610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 3613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_yuy2 3615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // stride_yuy2 3616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 3617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 3618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 3619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm5, ymm5, 8 3621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 3625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 3626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm0, ymm0, [eax + esi] 3627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpavgb ymm1, ymm1, [eax + esi + 32] 3628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 3629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vpsrlw ymm1, ymm1, 8 3631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // mutates. 3632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 3633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpand ymm1, ymm0, ymm5 // U 3634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // V 3635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm1, ymm1, ymm1 // mutates. 3636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm0 // mutates. 3637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 3638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 3639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vextractf128 [edx], ymm1, 0 // U 3640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vextractf128 [edx + edi], ymm0, 0 // V 3641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 3642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 32 3643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 3647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 3648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 36527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian__declspec(naked) 3653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_yuy2 3658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 3659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 36617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 36627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm5, ymm5, 8 36637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, edx 36647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 36657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop: 36667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 36677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 36687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 36697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 36707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm1, ymm1, 8 36717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // mutates. 
36727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 36737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm0, ymm5 // U 36747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // V 36757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm1, ymm1, ymm1 // mutates. 36767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm0 // mutates. 36777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 36787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 36797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edx], ymm1, 0 // U 36807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edx + edi], ymm0, 0 // V 36817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 36827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 36867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 3687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 36917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 36927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToYRow_AVX2(const uint8* src_uyvy, 36937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_y, int pix) { 
3694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 36957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_uyvy 3696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_y 3697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 3698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 37007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 37017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 37027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 37037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // odd bytes are Y 37047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm1, ymm1, 8 37057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // mutates. 
37067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 37077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 37087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 32] 37097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 37117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 3712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 37167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 37177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 37187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 3721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_yuy2 3723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // stride_yuy2 3724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 3725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 3726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 37277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, 
ymm5, ymm5 // generate mask 0x00ff00ff 37287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm5, ymm5, 8 3729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 37327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 37337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 37347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, [eax + esi] 37357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm1, ymm1, [eax + esi + 32] 37367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 37377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 37387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm5 37397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // mutates. 37407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 37417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm0, ymm5 // U 37427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // V 37437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm1, ymm1, ymm1 // mutates. 37447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm0 // mutates. 
37457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 37467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 37477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edx], ymm1, 0 // U 37487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edx + edi], ymm0, 0 // V 37497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 37507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 37557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 3756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 37607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 37617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUV422Row_AVX2(const uint8* src_uyvy, 37627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_yuy2 3766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 
3767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 37697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 37707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm5, ymm5, 8 3771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 37747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 37757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 37767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 37777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 37787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm1, ymm5 37797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // mutates. 37807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 37817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpand ymm1, ymm0, ymm5 // U 37827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // V 37837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm1, ymm1, ymm1 // mutates. 37847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm0 // mutates. 
37857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm1, ymm1, 0xd8 37867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 37877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edx], ymm1, 0 // U 37887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vextractf128 [edx + edi], ymm0, 0 // V 37897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 37907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 37947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 3795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 37987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_YUY2TOYROW_AVX2 3799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 38007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_YUY2TOYROW_SSE2 38017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 38027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid YUY2ToYRow_SSE2(const uint8* src_yuy2, 3803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_y, int pix) { 3804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 38057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_yuy2 3806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_y 
3807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 38087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 38097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrlw xmm5, 8 3810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 38127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 38137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 38157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pand xmm0, xmm5 // even bytes are Y 38167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pand xmm1, xmm5 3817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 38187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 3819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 38207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 3821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 38267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 38277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 3828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 
3829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 3831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_yuy2 3833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // stride_yuy2 3834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 3835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 3836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // pix 3837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 3839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 38427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 38437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 38447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + esi] 38457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + esi + 16] 3846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 3847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 3848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 38497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrlw xmm0, 8 // YUYV -> UVUV 
38507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrlw xmm1, 8 3851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 3852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 3853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // U 3854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 3855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // V 3856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm1, xmm1 3857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 3858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + edi], xmm1 3859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 3860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 3865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 38697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 38707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
__asm { 3873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_yuy2 3875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 3876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 3878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 3880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 38837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 38847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 38867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrlw xmm0, 8 // YUYV -> UVUV 38877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian psrlw xmm1, 8 3888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 3889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 3890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // U 3891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 3892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // V 3893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm1, xmm1 
3894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 3895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + edi], xmm1 3896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 3897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 39057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 39067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToYRow_SSE2(const uint8* src_uyvy, 39077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_y, int pix) { 3908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_uyvy 3910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_y 3911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // pix 3912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 3915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, 
[eax + 32] 3917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 // odd bytes are Y 3918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 3919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 3920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 3921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 39227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 3923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 39287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 39297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 39307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 3933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_yuy2 3935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // stride_yuy2 3936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_u 3937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_v 3938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov 
ecx, [esp + 8 + 20] // pix 3939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 3941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 3945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax + esi] 3947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax + esi + 16] 3948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 3949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 3950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 3951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // UYVY -> UVUV 3952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 3953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 3954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 3955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // U 3956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 3957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // V 3958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm1, xmm1 3959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 
3960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + edi], xmm1 3961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 3962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 3963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 3964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 3966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 3967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 3968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 3969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 39717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 39727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy, 39737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_u, uint8* dst_v, int pix) { 3974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 3975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 3976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_yuy2 3977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] // dst_u 3978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 4 + 12] // dst_v 3979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // pix 3980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 
3982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, edx 3983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 3985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 3986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 3988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // UYVY -> UVUV 3989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 3990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 3991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 3992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // U 3993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 3994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // V 3995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm1, xmm1 3996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 3997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + edi], xmm1 3998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 3999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 4000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 4003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian } 4005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_YUY2TOYROW_SSE2 4007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBBLENDROW_SSE2 4009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blend 8 pixels at a time. 40107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate constant 1 4020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm7, 15 4021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm6, 8 4023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm5, 8 
4025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm4, 24 40277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 40287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jl convertloop4b // less than 4 pixels? 4029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 pixel loop. 4031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop4: 4032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax] // src argb 4033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm3 // src argb 4035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm3, xmm4 // ~alpha 4036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [esi] // _r_b 4037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm3, 8 // alpha 4038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm3, xmm3, 0F5h 4040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm6 // _r_b 4041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm3, xmm7 // 256 - alpha 4042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm2, xmm3 // _r_b * alpha 4043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi] // _a_g 4044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 4045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
psrlw xmm1, 8 // _a_g 4046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 // set alpha to 255 4047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm1, xmm3 // _a_g * alpha 4048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 8 // _r_b convert to 8 bits again 4049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm2 // + src argb 4050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 // a_g_ convert to 8 bits again 4051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // + src argb 40527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 4053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 40547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge convertloop4 4056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop4b: 4058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 4 - 1 4059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl convertloop1b 4060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel loop. 
4062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop1: 4063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [eax] // src argb 4064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 4065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm3 // src argb 4066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm3, xmm4 // ~alpha 4067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esi] // _r_b 4068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm3, 8 // alpha 4069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm3, xmm3, 0F5h 4071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm6 // _r_b 4072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm3, xmm7 // 256 - alpha 4073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm2, xmm3 // _r_b * alpha 4074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [esi] // _a_g 4075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 4] 4076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // _a_g 4077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 // set alpha to 255 4078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm1, xmm3 // _a_g * alpha 4079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 8 // _r_b convert to 8 bits again 4080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm2 // + src argb 4081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 // a_g_ convert to 8 bits 
again 4082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // + src argb 4083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx], xmm0 4084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 4] 40857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 1 4086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge convertloop1 4087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop1b: 4089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBBLENDROW_SSE2 4094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBBLENDROW_SSSE3 4096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for isolating alpha. 
4097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha = { 4098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 4099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 4100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 4101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Same as SSE2, but replaces: 4102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// psrlw xmm3, 8 // alpha 4103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pshuflw xmm3, xmm3, 0F5h 4105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// with.. 4106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pshufb xmm3, kShuffleAlpha // alpha 4107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blend 8 pixels at a time. 
4108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 41097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 4111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate constant 0x0001 4119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm7, 15 4120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm6, 8 4122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm5, 8 4124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm4, 24 41267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 41277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jl convertloop4b // less than 4 pixels? 
4128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 pixel loop. 4130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop4: 4131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, [eax] // src argb 4132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm3 // src argb 4134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm3, xmm4 // ~alpha 4135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [esi] // _r_b 4136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm3, kShuffleAlpha // alpha 4137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm6 // _r_b 4138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm3, xmm7 // 256 - alpha 4139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm2, xmm3 // _r_b * alpha 4140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi] // _a_g 4141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 4142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // _a_g 4143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 // set alpha to 255 4144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm1, xmm3 // _a_g * alpha 4145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 8 // _r_b convert to 8 bits again 4146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm2 // + src argb 4147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 // a_g_ convert to 8 bits again 
4148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // + src argb 41497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 4150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 41517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 41527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jge convertloop4 4153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop4b: 4155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 4 - 1 4156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl convertloop1b 4157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel loop. 4159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop1: 4160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [eax] // src argb 4161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 4162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm3 // src argb 4163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm3, xmm4 // ~alpha 4164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esi] // _r_b 4165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm3, kShuffleAlpha // alpha 4166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm6 // _r_b 4167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm3, xmm7 // 256 - alpha 4168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm2, xmm3 // _r_b * alpha 4169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd 
xmm1, [esi] // _a_g 4170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 4] 4171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 // _a_g 4172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm4 // set alpha to 255 4173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm1, xmm3 // _a_g * alpha 4174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 8 // _r_b convert to 8 bits again 4175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm2 // + src argb 4176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 // a_g_ convert to 8 bits again 4177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // + src argb 4178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx], xmm0 4179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 4] 41807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 1 4181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge convertloop1 4182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop1b: 4184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBBLENDROW_SSSE3 4189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBATTENUATEROW_SSE2 4191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian// Attenuate 4 pixels at a time. 41927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 4194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb0 4196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 4197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 4198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm4, 24 4200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 4201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm5, 8 4202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 42047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels 4205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // first 2 4206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm2, xmm0, 0FFh // 8 alpha words 4207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm2, xmm2, 0FFh 4208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm2 // rgb * a 42097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax] // read 4 pixels 4210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm1 // next 2 pixels 
4211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm2, xmm1, 0FFh // 8 alpha words 4212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm2, xmm2, 0FFh 4213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm2 // rgb * a 42147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax] // alphas 4215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 4217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm4 4218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 4219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 4220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 // keep original alphas 4221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm2 42227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 4223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 42247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBATTENUATEROW_SSE2 4231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBATTENUATEROW_SSSE3 
4233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table duplicating alpha. 4234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha0 = { 4235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 4236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 4237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha1 = { 4238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 4239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 4240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 42417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb0 4245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 4246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 4247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm3, xmm3 // generate mask 0xff000000 4248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm3, 24 4249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuffleAlpha0 4250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuffleAlpha1 4251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian convertloop: 4253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels 4254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 // isolate first 2 alphas 4255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax] // read 4 pixels 4256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm1 // first 2 pixel rgbs 4257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm1 // rgb * a 4258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax] // read 4 pixels 4259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm5 // isolate next 2 alphas 4260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax] // read 4 pixels 4261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm2, xmm2 // next 2 pixel rgbs 4262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm2 // rgb * a 4263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax] // mask original alpha 4264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm3 4266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 4267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 4268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 4269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm2 // copy original alpha 4270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 4271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 
42727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBATTENUATEROW_SSSE3 4279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBATTENUATEROW_AVX2 4281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table duplicating alpha. 42827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha_AVX2 = { 42837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u 4284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 42857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { 4287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb0 4289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 4290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 4291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 42927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastf128 ymm4,kShuffleAlpha_AVX2 
4293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 4294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpslld ymm5, ymm5, 24 4295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm6, [eax] // read 8 pixels. 4298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm2, ymm0, ymm4 // low 4 alphas 4301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm3, ymm1, ymm4 // high 4 alphas 4302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm2 // rgb * a 4303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm1, ymm1, ymm3 // rgb * a 4304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpand ymm6, ymm6, ymm5 // isolate alpha 4305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 4306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrlw ymm1, ymm1, 8 4307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // unmutated. 
4308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpor ymm0, ymm0, ymm6 // copy original alpha 4309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [eax + edx], ymm0 4310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 43117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 4315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBATTENUATEROW_AVX2 4319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBUNATTENUATEROW_SSE2 4321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Unattenuate 4 pixels at a time. 
43227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 4324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 4325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 4328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb0 4329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 8] // dst_argb 4330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 12] // width 4331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels 4334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx esi, byte ptr [eax + 3] // first alpha 4335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edi, byte ptr [eax + 7] // second alpha 4336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // first 2 4337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, dword ptr fixed_invtbl8[esi * 4] 4338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, dword ptr fixed_invtbl8[edi * 4] 4339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 
1, a, a, a 4340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words 4341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlhps xmm2, xmm3 4342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm2 // rgb * a 4343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax] // read 4 pixels 4345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx esi, byte ptr [eax + 11] // third alpha 4346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edi, byte ptr [eax + 15] // forth alpha 4347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm1 // next 2 4348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, dword ptr fixed_invtbl8[esi * 4] 4349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, dword ptr fixed_invtbl8[edi * 4] 4350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words 4351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words 4352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlhps xmm2, xmm3 4353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm2 // rgb * a 4354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 4357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 4358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 43597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian sub ecx, 4 4360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 4362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBUNATTENUATEROW_SSE2 4367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBUNATTENUATEROW_AVX2 4369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table duplicating alpha. 43707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic const uvec8 kUnattenShuffleAlpha_AVX2 = { 43717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u 4372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 4373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 4374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// USE_GATHER is not on by default, due to being a slow instruction. 
4375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef USE_GATHER 43767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 4378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 4379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb0 4381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 4382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 4383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 43847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 4385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm6, [eax] // read 8 pixels. 4388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. 4389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. 4390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 
1, a 4393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 4394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 4395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a 4396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas 4397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 4398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 4399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // unmutated. 4400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [eax + edx], ymm0 4401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 44027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 4406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#else // USE_GATHER 44107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 4412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) 
{ 4413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb0 4416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 4417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 4418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 44197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 4420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 4423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // replace VPGATHER 4426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx esi, byte ptr [eax + 3] // alpha0 4427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edi, byte ptr [eax + 7] // alpha1 4428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] 4429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] 4430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx esi, byte ptr [eax + 11] // alpha2 4431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edi, byte ptr [eax + 15] // alpha3 4432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] 4433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] 4434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] 4435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx esi, byte ptr [eax + 19] // alpha4 4436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edi, byte ptr [eax + 23] // alpha5 4437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] 4438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] 4439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] 4440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx esi, byte ptr [eax + 27] // alpha6 4441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edi, byte ptr [eax + 31] // alpha7 4442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] 4443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] 4444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] 4445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] 4446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] 4447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] 4448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] 4449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // end of VPGATHER 
4450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm6, [eax] // read 8 pixels. 4452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 4455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 4456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a 4457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas 4458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 4459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 4460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // unmutated. 
4461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [eax + edx], ymm0 4462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 44637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 4467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 4469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // USE_GATHER 4473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBATTENUATEROW_AVX2 4474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBGRAYROW_SSSE3 4476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 
44777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 4481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_argb */ 4482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* width */ 4483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kARGBToYJ 4484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kAddYJ64 4485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 44877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // G 44887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 4489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm4 4490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 4491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm1 4492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm5 // Add .5 for rounding. 
4493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 4494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // 8 G bytes 44957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax] // A 44967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 16] 4497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 4498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm2, 24 4499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm3, 24 4500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm2, xmm3 4501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm2, xmm2 // 8 A bytes 4502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA 4503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // 8 GG words 4504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm3, xmm2 // 8 GA words 4505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 4506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm3 // GGGA first 4 4507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm1, xmm3 // GGGA next 4 45087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 45097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 4510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 45117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 
4514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBGRAYROW_SSSE3 4517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSEPIAROW_SSSE3 4519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// b = (r * 35 + g * 68 + b * 17) >> 7 4520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// g = (r * 45 + g * 88 + b * 22) >> 7 4521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// r = (r * 50 + g * 98 + b * 24) >> 7 4522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constant for ARGB color to sepia tone. 4523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToSepiaB = { 4524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 4525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 4526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToSepiaG = { 4528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 4529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 4530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToSepiaR = { 4532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 4533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 4534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
4535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 45367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 4538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* dst_argb */ 4540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8] /* width */ 4541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, kARGBToSepiaB 4542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, kARGBToSepiaG 4543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kARGBToSepiaR 4544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 45467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // B 45477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm6, [eax + 16] 4548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm2 4549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm6, xmm2 4550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm6 4551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 4552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // 8 B values 45537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [eax] // G 45547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 4555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
pmaddubsw xmm5, xmm3 4556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm3 4557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm5, xmm1 4558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 7 4559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm5, xmm5 // 8 G values 4560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 // 8 BG values 45617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [eax] // R 45627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 4563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm5, xmm4 4564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 4565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm5, xmm1 4566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 7 4567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm5, xmm5 // 8 R values 45687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm6, [eax] // A 45697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 4570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm6, 24 4571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm1, 24 4572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm6, xmm1 4573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm6, xmm6 // 8 A values 4574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm5, xmm6 // 8 RA values 4575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 // Weave BG, RA together 4576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian punpcklwd xmm0, xmm5 // BGRA first 4 4577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm1, xmm5 // BGRA next 4 45787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax], xmm0 45797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax + 16], xmm1 4580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 45817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBSEPIAROW_SSSE3 4587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 4589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Tranform 8 ARGB pixels (32 bytes) with color matrix. 4590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Same as Sepia except matrix is provided. 4591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R 4592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
45937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 4595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const int8* matrix_argb, int width) { 4596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 4598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_argb */ 4599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* matrix_argb */ 4600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm5, [ecx] 4601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm2, xmm5, 0x00 4602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm3, xmm5, 0x55 4603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm4, xmm5, 0xaa 4604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm5, xmm5, 0xff 4605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] /* width */ 4606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 46087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // B 46097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm7, [eax + 16] 4610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm2 4611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm7, xmm2 46127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm6, [eax] // G 46137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian movdqu xmm1, [eax + 16] 4614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm6, xmm3 4615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm3 4616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddsw xmm0, xmm7 // B 4617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddsw xmm6, xmm1 // G 4618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm0, 6 // B 4619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm6, 6 // G 4620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // 8 B values 4621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm6, xmm6 // 8 G values 4622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm6 // 8 BG values 46237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax] // R 46247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm7, [eax + 16] 4625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm4 4626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm7, xmm4 4627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddsw xmm1, xmm7 // R 46287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm6, [eax] // A 46297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm7, [eax + 16] 4630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm6, xmm5 4631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm7, xmm5 4632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddsw xmm6, xmm7 // A 4633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psraw xmm1, 6 // R 4634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian psraw xmm6, 6 // A 4635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm1, xmm1 // 8 R values 4636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm6, xmm6 // 8 A values 4637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm6 // 8 RA values 4638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, xmm0 // Weave BG, RA together 4639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm1 // BGRA first 4 4640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm6, xmm1 // BGRA next 4 46417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 46427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm6 4643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 4644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 46457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 4651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBQUANTIZEROW_SSE2 4653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Quantize 4 ARGB pixels (16 bytes). 
46547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 4656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int interval_offset, int width) { 4657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* dst_argb */ 4659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esp + 8] /* scale */ 4660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [esp + 12] /* interval_size */ 4661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm4, [esp + 16] /* interval_offset */ 4662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 20] /* width */ 4663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm2, xmm2, 040h 4664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm2, xmm2, 044h 4665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm3, xmm3, 040h 4666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm3, xmm3, 044h 4667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm4, xmm4, 040h 4668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm4, xmm4, 044h 4669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 // constant 0 4670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // generate mask 0xff000000 4671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm6, 24 4672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 
46747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels 4675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 // first 2 pixels 4676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm2 // pixel * scale >> 16 46777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax] // read 4 pixels 4678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm5 // next 2 pixels 4679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm2 4680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm0, xmm3 // * interval_size 46817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm7, [eax] // read 4 pixels 4682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmullw xmm1, xmm3 4683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm7, xmm6 // mask alpha 4684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm4 // + interval_size / 2 4685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm1, xmm4 4686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 4687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm7 46887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [eax], xmm0 4689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 46907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 
4695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBQUANTIZEROW_SSE2 4696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSHADEROW_SSE2 4698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shade 4 pixels at a time by specified value. 46997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 4701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint32 value) { 4702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 4704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 4705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // width 4706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esp + 16] // value 4707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm2 4708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklqdq xmm2, xmm2 4709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 47117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels 4712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 4714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // first 2 4715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
punpckhbw xmm1, xmm1 // next 2 4716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm2 // argb * value 4717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm2 // argb * value 4718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 4719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 4720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 47217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 4722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 47237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBSHADEROW_SSE2 4730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBMULTIPLYROW_SSE2 4732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
47337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 // constant 0 4743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels from src_argb0 4746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [esi] // read 4 pixels from src_argb1 4747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, xmm0 4748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, xmm2 4749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 // first 2 4750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm1 // next 2 4751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm5 // first 2 4752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm3, xmm5 // 
next 2 4753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 4754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 4755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 4757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 4758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 4759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 47607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBMULTIPLYROW_SSE2 4768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBADDROW_SSE2 4770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Add 2 rows of ARGB pixels together, 4 pixels at a time. 4771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Port this to posix, neon and other math functions. 
47727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 4783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl convertloop49 4784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop4: 4786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels from src_argb0 4787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi] // read 4 pixels from src_argb1 4789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 4790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // src_argb0 + src_argb1 4791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 4792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian lea edx, [edx + 16] 47937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge convertloop4 4795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop49: 4797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 4 - 1 4798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl convertloop19 4799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop1: 4801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, [eax] // read 1 pixels from src_argb0 4802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 4803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [esi] // read 1 pixels from src_argb1 4804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 4] 4805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // src_argb0 + src_argb1 4806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx], xmm0 4807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 4] 48087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 1 4809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge convertloop1 4810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop19: 4812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 
4815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBADDROW_SSE2 4817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSUBTRACTROW_SSE2 4819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 48207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 4 pixels from src_argb0 4832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 4833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi] // read 4 pixels from src_argb1 4834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 
4835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubusb xmm0, xmm1 // src_argb0 - src_argb1 4836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 4837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 48387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 4839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBSUBTRACTROW_SSE2 4846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBMULTIPLYROW_AVX2 4848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
48497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpxor ymm5, ymm5, ymm5 // constant 0 4859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 4862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 4863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 4864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 32] 4865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm1, ymm1 // low 4 4866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpckhbw ymm1, ymm1, ymm1 // high 4 4867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpunpcklbw ymm2, ymm3, ymm5 // low 4 4868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian vpunpckhbw ymm3, ymm3, ymm5 // high 4 4869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 4870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 4871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 4872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 4873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 4874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 4875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 4879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBMULTIPLYROW_AVX2 4883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBADDROW_AVX2 4885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Add 2 rows of ARGB pixels together, 8 pixels at a time. 
48867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 4899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 4900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 32] 4901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 4902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 4903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 4904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop 
esi 4907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 4908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBADDROW_AVX2 4912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSUBTRACTROW_AVX2 4914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 49157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 4918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb0 4921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_argb1 4922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 4923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 4924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 
4928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 4929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 32] 4930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 4931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 4932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 8 4933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 4937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBSUBTRACTROW_AVX2 4941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELXROW_SSE2 4943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelX as a matrix is 4944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1 0 1 4945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -2 0 2 4946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1 0 1 49477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 4948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* src_y2, uint8* dst_sobelx, int width) { 
4950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 4951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 4952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 4953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_y0 4954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_y1 4955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 12] // src_y2 4956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // dst_sobelx 4957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 4958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub esi, eax 4959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, eax 4960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 4961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 // constant 0 4962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 4964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 4965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 4966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 4967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm5 4968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm0, xmm1 4969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 4970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 4971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm5 4972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm5 4973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm1, xmm2 4974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] 4975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] 4976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm5 4977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm3, xmm5 4978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm2, xmm3 4979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm2 4980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm1 4981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm1 4982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw 4983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm1, xmm0 4984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaxsw xmm0, xmm1 4985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 4986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [eax + edx], xmm0 4987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 49887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 4989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 4990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 4992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 4993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 4994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 4995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 4996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SOBELXROW_SSE2 4997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELYROW_SSE2 4999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelY as a matrix is 5000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1 -2 -1 5001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 0 0 0 5002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 1 2 1 50037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 5005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* 
dst_sobely, int width) { 5006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_y0 5009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_y1 5010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_sobely 5011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 5012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub esi, eax 5013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, eax 5014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 // constant 0 5015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 5017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 5018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 5019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 5020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm5 5021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm0, xmm1 5022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] 5023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] 5024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm5 5025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian punpcklbw xmm2, xmm5 5026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm1, xmm2 5027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 5028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 5029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm5 5030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm3, xmm5 5031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm2, xmm3 5032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm2 5033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm1 5034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm1 5035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw 5036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm1, xmm0 5037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaxsw xmm0, xmm1 5038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 5039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [eax + edx], xmm0 5040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 50417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 5042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 5043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SOBELYROW_SSE2 5049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELROW_SSE2 5051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
5052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255 5053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel 5054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel 5055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel 50567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 5059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_sobelx 5062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_sobely 5063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 5064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 5065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub esi, eax 5066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // alpha 255 5067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 24 // 0xff000000 5068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 50707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 16 pixels src_sobelx 50717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian lea eax, [eax + 16] 5073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // sobel = sobelx + sobely 5074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // GG 5075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm0 // First 8 5076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm0, xmm0 // Next 8 5077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm2 // GGGG 5078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm1, xmm2 // First 4 5079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm2, xmm2 // Next 4 5080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm1, xmm5 // GGGA 5081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm2, xmm5 5082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm0 // GGGG 5083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm3, xmm0 // Next 4 5084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm0, xmm0 // Last 4 5085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm3, xmm5 // GGGA 5086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian por xmm0, xmm5 50877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm1 50887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm2 50897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 32], xmm3 50907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 48], xmm0 5091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 50927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 
5093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 5094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SOBELROW_SSE2 5100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELTOPLANEROW_SSE2 5102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into a plane. 51037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_y, int width) { 5106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_sobelx 5109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_sobely 5110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 5111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 5112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub esi, eax 5113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
convertloop: 51157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 16 pixels src_sobelx 51167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 // sobel = sobelx + sobely 51197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 5120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 51217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 5123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SOBELTOPLANEROW_SSE2 5129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELXYROW_SSE2 5131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
5132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255 5133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel X 5134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel 5135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel Y 51367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int width) { 5139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_sobelx 5142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_sobely 5143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 5144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // width 5145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub esi, eax 5146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // alpha 255 5147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 51497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // read 16 pixels src_sobelx 51507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
movdqa xmm2, xmm0 5153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm2, xmm1 // sobel = sobelx + sobely 5154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm0 // XA 5155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm3, xmm5 5156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm0, xmm5 5157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm1 // YS 5158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm4, xmm2 5159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm2 5160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, xmm4 // YSXA 5161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm6, xmm3 // First 4 5162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm4, xmm3 // Next 4 5163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm7, xmm1 // YSXA 5164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm7, xmm0 // Next 4 5165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm1, xmm0 // Last 4 51667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm6 51677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm4 51687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 32], xmm7 51697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 48], xmm1 5170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 51717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 5173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian 5174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_SOBELXYROW_SSE2 5179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Consider float CumulativeSum. 5182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Consider calling CumulativeSum one row at time as needed. 5183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Consider circular CumulativeSum buffer of radius * 2 + 1 height. 5184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert cumulative sum for an area to an average for 1 pixel. 5185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// topleft is pointer to top left of CumulativeSum buffer for area. 5186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// botleft is pointer to bottom left of CumulativeSum buffer. 5187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width is offset from left to right of area in CumulativeSum buffer measured 5188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// in number of ints. 5189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// area is the number of pixels in the area being averaged. 5190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// dst points to pixel to store result to. 5191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// count is number of averaged pixels to produce. 
51927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Does 4 pixels at a time. 5193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 5194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width, int area, uint8* dst, 5195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int count) { 5196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, topleft // eax topleft 5198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, botleft // esi botleft 5199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, width 5200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, area 5201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, dst 5202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, count 5203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm5, xmm5 5204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian rcpss xmm4, xmm5 // 1.0f / area 5205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm4, xmm4, 0 5206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 5207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl l4b 5208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp area, 128 // 128 pixels will not overflow 15 bits. 
5210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ja l4 5211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm5, xmm5, 0 // area 5213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 5214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm6, 16 5215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm6, xmm6 5216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm5, xmm6 // (65536.0 + area - 1) 5217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area 5218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtps2dq xmm5, xmm5 // 0.16 fixed point 5219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm5, xmm5 // 16 bit shorts 5220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 pixel loop small blocks. 
5222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian s4: 5223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // top left 52247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 52257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 52267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 52277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 5228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // - top right 5230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm0, [eax + edx * 4] 5231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm1, [eax + edx * 4 + 16] 5232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm2, [eax + edx * 4 + 32] 5233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm3, [eax + edx * 4 + 48] 5234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 5235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // - bottom left 5237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm0, [esi] 5238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm1, [esi + 16] 5239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm2, [esi + 32] 5240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm3, [esi + 48] 5241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // + bottom right 5243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, [esi + edx * 4] 
5244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm1, [esi + edx * 4 + 16] 5245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, [esi + edx * 4 + 32] 5246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm3, [esi + edx * 4 + 48] 5247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 64] 5248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm0, xmm1 // pack 4 pixels into 2 registers 5250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm2, xmm3 5251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm0, xmm5 5253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm2, xmm5 5254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 5256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi], xmm0 5257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 16] 5258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 5259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge s4 5260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp l4b 5262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 pixel loop 5264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l4: 5265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // top left 52667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 
52677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 52687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + 32] 52697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + 48] 5270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // - top right 5272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm0, [eax + edx * 4] 5273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm1, [eax + edx * 4 + 16] 5274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm2, [eax + edx * 4 + 32] 5275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm3, [eax + edx * 4 + 48] 5276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 5277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // - bottom left 5279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm0, [esi] 5280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm1, [esi + 16] 5281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm2, [esi + 32] 5282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm3, [esi + 48] 5283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // + bottom right 5285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, [esi + edx * 4] 5286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm1, [esi + edx * 4 + 16] 5287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, [esi + edx * 4 + 32] 5288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian paddd xmm3, [esi + edx * 4 + 48] 5289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 64] 5290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 5292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm1, xmm1 5293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm0, xmm4 5294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm1, xmm4 5295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm2, xmm2 5296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm3, xmm3 5297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm2, xmm4 5298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm3, xmm4 5299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtps2dq xmm0, xmm0 5300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtps2dq xmm1, xmm1 5301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtps2dq xmm2, xmm2 5302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtps2dq xmm3, xmm3 5303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm0, xmm1 5304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm2, xmm3 5305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm2 5306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi], xmm0 5307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 16] 5308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 5309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge l4 5310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian 5311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l4b: 5312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 4 - 1 5313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl l1b 5314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel loop 5316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l1: 53177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 5318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm0, [eax + edx * 4] 5319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubd xmm0, [esi] 5321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, [esi + edx * 4] 5322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 5323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm0, xmm0 5324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm0, xmm4 5325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtps2dq xmm0, xmm0 5326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm0, xmm0 5327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 5328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd dword ptr [edi], xmm0 5329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 4] 5330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 1 5331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge l1 5332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l1b: 5333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 
5334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 5338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Creates a table of cumulative sums where each value is a sum of all values 5339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// above and to the left of the value. 5340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 5341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const int32* previous_cumsum, int width) { 5342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, row 5344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, cumsum 5345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, previous_cumsum 5346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, width 5347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm0, xmm0 5348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm1, xmm1 5349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 5351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl l4b 5352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian test edx, 15 5353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jne l4b 5354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian // 4 pixel loop 5356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l4: 5357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 5358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm2 5360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm1 5362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 5363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm2, xmm1 5364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm3, xmm1 5365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm4, xmm1 5367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, xmm4 5368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm4, xmm1 5369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm5, xmm1 5370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm2 53727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [esi] // previous row above. 
5373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, xmm0 5374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm3 53767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [esi + 16] 5377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm3, xmm0 5378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm4 53807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [esi + 32] 5381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm4, xmm0 5382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm5 53847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [esi + 48] 5385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 64] 5386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm5, xmm0 5387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 53887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm2 53897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm3 53907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 32], xmm4 53917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 48], xmm5 5392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 5394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 5395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge l4 
5396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l4b: 5398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 4 - 1 5399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl l1b 5400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel loop 5402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l1: 5403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 5404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 5405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm1 5406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm2, xmm1 5407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm2 5408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [esi] 5409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 5410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, xmm0 5411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm2 5412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 5413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 1 5414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge l1 5415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l1b: 5417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 5420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBAFFINEROW_SSE2 5422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Copy ARGB pixels from source image with slope to a row of destination. 54237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianLIBYUV_API 5425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 5426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, const float* uv_dudv, int width) { 5427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 5430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 12] // src_argb 5431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 16] // stride 5432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 20] // dst_argb 5433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 24] // pointer to uv_dudv 5434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [ecx] // uv 5435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm7, qword ptr [ecx + 8] // dudv 5436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 28] // width 5437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shl esi, 16 // 4, stride 5438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add esi, 4 
5439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, esi 5440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 5441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl l4b 5442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // setup for 4 pixel loop 5444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm7, xmm7, 0x44 // dup dudv 5445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm5, xmm5, 0 // dup 4, stride 5446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm2 // x0, y0, x1, y1 5447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm0, xmm7 5448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movlhps xmm2, xmm0 5449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm7 5450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm4, xmm4 // dudv *= 2 5451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm2 // x2, y2, x3, y3 5452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm3, xmm4 5453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm4, xmm4 // dudv *= 4 5454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 pixel loop 5456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l4: 5457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvttps2dq xmm0, xmm2 // x, y float to int first 2 5458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvttps2dq xmm1, xmm3 // x, y float to int next 2 5459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packssdw xmm0, xmm1 // x, y as 8 shorts 
5460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 5461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd esi, xmm0 5462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x39 // shift right 5463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd edi, xmm0 5464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x39 // shift right 5465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [eax + esi] // read pixel 0 5466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm6, [eax + edi] // read pixel 1 5467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm1, xmm6 // combine pixel 0 and 1 5468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm2, xmm4 // x, y += dx, dy first 2 5469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm1 5470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd esi, xmm0 5471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x39 // shift right 5472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd edi, xmm0 5473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm6, [eax + esi] // read pixel 2 5474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, [eax + edi] // read pixel 3 5475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm6, xmm0 // combine pixel 2 and 3 5476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm3, xmm4 // x, y += dx, dy next 2 5477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr 8[edx], xmm6 54787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 16] 
54797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 54807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jge l4 5481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 54827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian l4b: 54837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian add ecx, 4 - 1 54847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jl l1b 5485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 54867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // 1 pixel loop 54877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian l1: 54887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian cvttps2dq xmm0, xmm2 // x, y float to int 54897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian packssdw xmm0, xmm0 // x, y as shorts 54907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 54917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian addps xmm2, xmm7 // x, y += dx, dy 54927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd esi, xmm0 54937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd xmm0, [eax + esi] // copy a pixel 54947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movd [edx], xmm0 54957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 4] 54967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 1 54977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jge l1 54987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian l1b: 5499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 5500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 
5502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 55047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_ARGBAFFINEROW_SSE2 5505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 55067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_INTERPOLATEROW_AVX2 55077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Bilinear filter 32x2 -> 32x1 55087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 55097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 5510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, int dst_width, 5511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int source_y_fraction) { 5512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 5515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 4] // dst_ptr 5516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_ptr 5517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // src_stride 5518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 16] // dst_width 5519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 55207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian shr eax, 1 5521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Dispatch to specialized filters if applicable. 
5522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 0 55237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian je xloop100 // 0 / 128. Blend 100 / 0. 55247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub edi, esi 55257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian cmp eax, 32 55267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 5527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 64 55287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 55297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian cmp eax, 96 55307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 5531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 55327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm0, eax // high fraction 0..127 55337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian neg eax 55347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian add eax, 128 55357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovd xmm5, eax // low fraction 128..1 55367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw xmm5, xmm5, xmm0 55377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklwd xmm5, xmm5, xmm5 55387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpxor ymm0, ymm0, ymm0 55397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermd ymm5, ymm0, ymm5 5540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 55427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [esi] 
55437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm2, [esi + edx] 55447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhbw ymm1, ymm0, ymm2 // mutates 55457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm0, ymm0, ymm2 // mutates 55467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmaddubsw ymm0, ymm0, ymm5 55477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmaddubsw ymm1, ymm1, ymm5 55487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 7 55497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm1, ymm1, 7 55507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 // unmutates 55517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [esi + edi], ymm0 55527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea esi, [esi + 32] 55537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 5554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 5555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 55577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Blend 25 / 75. 
55587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop25: 55597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [esi] 55607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [esi + edx] 55617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, ymm1 55627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, ymm1 55637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [esi + edi], ymm0 55647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea esi, [esi + 32] 55657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 55667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg xloop25 55677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jmp xloop99 55687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 55697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Blend 50 / 50. 55707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop50: 55717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [esi] 55727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, [esi + edx] 55737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [esi + edi], ymm0 55747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea esi, [esi + 32] 55757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 55767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg xloop50 55777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jmp xloop99 55787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 55797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Blend 75 / 25. 
55807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop75: 55817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [esi] 55827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [esi + edx] 55837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, ymm1 55847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, ymm1 55857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [esi + edi], ymm0 55867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea esi, [esi + 32] 55877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 55887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg xloop75 55897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jmp xloop99 55907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 55917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // Blend 100 / 0 - Copy row unchanged. 
55927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop100: 55937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian rep movsb 5594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99: 5596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 5597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 55987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 5599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 56027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_INTERPOLATEROW_AVX2 5603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear filter 16x2 -> 16x1 56057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 56067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 56077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ptrdiff_t src_stride, int dst_width, 56087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int source_y_fraction) { 5609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 5612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 4] // dst_ptr 5613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_ptr 5614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov 
edx, [esp + 8 + 12] // src_stride 5615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 16] // dst_width 5616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 5618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shr eax, 1 5619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Dispatch to specialized filters if applicable. 5620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 0 5621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop100 // 0 / 128. Blend 100 / 0. 5622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 32 5623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 5624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 64 5625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 96 5627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 
5628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, eax // high fraction 0..127 5630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian neg eax 5631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add eax, 128 5632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, eax // low fraction 128..1 5633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm5, xmm0 5634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm5, xmm5 5635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm5, xmm5, 0 5636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 5638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [esi + edx] 5640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, xmm0 5641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm2 5642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm2 5643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm5 5644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm1, xmm5 5645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 5646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 7 5647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 5648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 
56507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 5652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 25 / 75. 5655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop25: 5656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi + edx] 5658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 56627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop25 5664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 50 / 50. 
5667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop50: 5668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi + edx] 5670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 56737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop50 5675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 75 / 25. 5678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop75: 5679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi] 5680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi + edx] 5681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 56857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop75 5687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 100 / 0 - Copy row 
unchanged. 5690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop100: 5691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 56947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop100 5696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99: 5698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 5699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_INTERPOLATEROW_SSE2 5705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear filter 16x2 -> 16x1 57067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 57077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 57087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ptrdiff_t src_stride, int dst_width, 57097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian int source_y_fraction) { 5710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push 
edi 5713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 4] // dst_ptr 5714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_ptr 5715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // src_stride 5716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 16] // dst_width 5717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edi, esi 5719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Dispatch to specialized filters if applicable. 5720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 0 5721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop100 // 0 / 256. Blend 100 / 0. 5722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 64 5723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. 5724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 128 5725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 5726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp eax, 192 5727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
5728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, eax // xmm5 = y fraction 5730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm5, xmm5 5731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 1 5732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm5, xmm5 5733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm5, xmm5 5734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklqdq xmm5, xmm5 5735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm4, xmm4 5736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 5738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] // row0 5739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm2, [esi + edx] // row1 5740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, xmm0 5741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm3, xmm2 5742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm4 5743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm3, xmm4 5744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm4 5745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm4 5746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm2, xmm0 // row1 - row0 5747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psubw xmm3, xmm1 5748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 5749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw 
xmm3, xmm3 5750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhw xmm2, xmm5 // scale diff 5751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhw xmm3, xmm5 5752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm0, xmm2 // sum rows 5753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddw xmm1, xmm3 5754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 5755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 57577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 5759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 25 / 75. 
5762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop25: 5763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi + edx] 5765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 57697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop25 5771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 50 / 50. 5774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop50: 5775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi + edx] 5777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 57807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop50 5782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 75 / 25. 
5785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop75: 5786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [esi] 5787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi + edx] 5788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 5790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 57927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop75 5794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp xloop99 5795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // Blend 100 / 0 - Copy row unchanged. 
5797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop100: 5798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [esi] 5799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [esi + edi], xmm0 5800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 16] 58017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop100 5803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99: 5805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 5806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_INTERPOLATEROW_SSE2 5811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
58137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* shuffler, int pix) { 5816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 5818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 5819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // shuffler 58207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [ecx] 5821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // pix 5822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 5824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 5825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 5826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 5827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm5 5828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm5 5829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 5830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 5831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 58327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 5833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 5834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian ret 5835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSHUFFLEROW_AVX2 58397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* shuffler, int pix) { 5842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 5844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_argb 5845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // shuffler 5846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
5847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // pix 5848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 5850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 5851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 5852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 64] 5853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm0, ymm0, ymm5 5854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpshufb ymm1, ymm1, ymm5 5855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx], ymm0 5856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 5857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 64] 58587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 5859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 5860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 5862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBSHUFFLEROW_AVX2 5866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 58677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 5869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
const uint8* shuffler, int pix) { 5870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push ebx 5872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 5874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 8] // dst_argb 5875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 12] // shuffler 5876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 16] // pix 5877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 5878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ebx, [esi] // shuffler 5880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp ebx, 0x03000102 5881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je shuf_3012 5882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp ebx, 0x00010203 5883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je shuf_0123 5884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp ebx, 0x00030201 5885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je shuf_0321 5886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp ebx, 0x02010003 5887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je shuf_2103 5888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // TODO(fbarchard): Use one source pointer and 3 offsets. 
5890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shuf_any1: 5891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [esi] 5892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [eax + ebx] 5893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov [edx], bl 5894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [esi + 1] 5895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [eax + ebx] 5896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov [edx + 1], bl 5897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [esi + 2] 5898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [eax + ebx] 5899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov [edx + 2], bl 5900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [esi + 3] 5901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, byte ptr [eax + ebx] 5902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov [edx + 3], bl 5903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 5904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 4] 5905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 1 5906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg shuf_any1 5907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp shuf99 5908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shuf_0123: 5910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 5911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, 
[eax + 16] 5912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 5913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 5914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm5 5915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB 5916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm0, xmm0, 01Bh 5917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm1, xmm1, 01Bh 5918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm1, xmm1, 01Bh 5919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 5920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 5921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 59227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 5923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg shuf_0123 5924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp shuf99 5925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shuf_0321: 5927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 5928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 5930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 5931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm5 5932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB 
5933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm0, xmm0, 039h 5934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm1, xmm1, 039h 5935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm1, xmm1, 039h 5936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 5937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 5938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 59397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 5940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg shuf_0321 5941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp shuf99 5942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shuf_2103: 5944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 5945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 5947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 5948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm5 5949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA 5950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm0, xmm0, 093h 5951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm1, xmm1, 093h 5952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm1, xmm1, 093h 5953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 5954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
movdqu [edx], xmm0 5955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 59567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 5957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg shuf_2103 5958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jmp shuf99 5959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shuf_3012: 5961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] 5962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 5963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 5964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 5965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm5 5966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB 5967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm0, xmm0, 0C6h 5968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufhw xmm1, xmm1, 0C6h 5969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshuflw xmm1, xmm1, 0C6h 5970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 5971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edx], xmm0 5972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 59737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 5974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg shuf_3012 5975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shuf99: 
5977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 5978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop ebx 5979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 5980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 5981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 5982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// YUY2 - Macro-pixel = 2 image pixels 5984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 5985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 5986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// UYVY - Macro-pixel = 2 image pixels 5987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// U0Y0V0Y1 5988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 59897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 5990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToYUY2Row_SSE2(const uint8* src_y, 5991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* src_u, 5992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* src_v, 5993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_frame, int width) { 5994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 5995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 5996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 5997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_y 5998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_u 5999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian mov edx, [esp + 8 + 12] // src_v 6000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_frame 6001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 6002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, esi 6003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [esi] // U 6006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm3, qword ptr [esi + edx] // V 6007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 8] 6008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm3 // UV 6009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // Y 6010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 6011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 6012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm2 // YUYV 6013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm2 6014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi], xmm0 6015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi + 16], xmm1 6016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 32] 6017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 6018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 
6021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 6022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 6023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 60267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToUYVYRow_SSE2(const uint8* src_y, 6028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* src_u, 6029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* src_v, 6030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_frame, int width) { 6031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 6032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 6033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 6034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_y 6035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_u 6036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // src_v 6037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 16] // dst_frame 6038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // width 6039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub edx, esi 6040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [esi] // U 
6043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm3, qword ptr [esi + edx] // V 6044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + 8] 6045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm2, xmm3 // UV 6046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax] // Y 6047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm2 6048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 6049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm0 // UYVY 6050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm2, xmm0 6051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi], xmm1 6052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi + 16], xmm2 6053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 32] 6054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 6055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 6058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 6059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 6060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 60647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid 
ARGBPolynomialRow_SSE2(const uint8* src_argb, 6066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, const float* poly, 6067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 6068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 6069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 6070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] /* src_argb */ 6071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 8] /* dst_argb */ 6072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 12] /* poly */ 6073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] /* width */ 6074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 6075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 2 pixel loop. 
6077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel 6079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel 6080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [eax] // BGRABGRA 6081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 6082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm3 6083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, xmm0 6084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm3 // pixel 0 6085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhwd xmm4, xmm3 // pixel 1 6086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm0, xmm0 // 4 floats 6087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvtdq2ps xmm4, xmm4 6088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 // X 6089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, xmm4 6090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm0, [esi + 16] // C1 * X 6091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm4, [esi + 16] 6092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm0, [esi] // result = C0 + C1 * X 6093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm4, [esi] 6094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm1 6095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, xmm5 6096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm2, xmm1 // X * X 
6097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm6, xmm5 6098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm1, xmm2 // X * X * X 6099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm5, xmm6 6100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm2, [esi + 32] // C2 * X * X 6101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm6, [esi + 32] 6102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm1, [esi + 48] // C3 * X * X * X 6103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mulps xmm5, [esi + 48] 6104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm0, xmm2 // result += C2 * X * X 6105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm4, xmm6 6106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm0, xmm1 // result += C3 * X * X * X 6107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian addps xmm4, xmm5 6108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvttps2dq xmm0, xmm0 6109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cvttps2dq xmm4, xmm4 6110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm4 6111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 6112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 6113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 61147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 2 6115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 6117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 
6118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBPOLYNOMIALROW_SSE2 6121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 61237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBPolynomialRow_AVX2(const uint8* src_argb, 6125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, const float* poly, 6126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 6127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 6128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] /* src_argb */ 6129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8] /* dst_argb */ 6130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] /* poly */ 6131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm4, [ecx] // C0 6132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm5, [ecx + 16] // C1 6133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm6, [ecx + 32] // C2 6134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vbroadcastf128 ymm7, [ecx + 48] // C3 6135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] /* width */ 6136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 2 pixel loop. 
6138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels 6140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 8] 6141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vcvtdq2ps ymm0, ymm0 // X 8 floats 6142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmulps ymm2, ymm0, ymm0 // X * X 6143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmulps ymm3, ymm0, ymm7 // C3 * X 6144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X 6145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X 6146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X 6147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vcvttps2dq ymm0, ymm0 6148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 6149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 6150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 6151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vmovq qword ptr [edx], xmm0 6152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 61537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 2 6154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian vzeroupper 6156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 
6157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBPOLYNOMIALROW_AVX2 6160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOLORTABLEROW_X86 6162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Tranform ARGB pixels with color table. 61637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 6165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width) { 6166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 6167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 6168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] /* dst_argb */ 6169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] /* table_argb */ 6170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 12] /* width */ 6171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel loop. 
6173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax] 6175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 6176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4] 6177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4], dl 6178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax - 4 + 1] 6179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4 + 1] 6180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4 + 1], dl 6181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax - 4 + 2] 6182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4 + 2] 6183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4 + 2], dl 6184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax - 4 + 3] 6185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4 + 3] 6186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4 + 3], dl 6187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian dec ecx 6188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 6190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 6191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBCOLORTABLEROW_X86 
6194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_RGBCOLORTABLEROW_X86 6196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Tranform RGB pixels with color table. 61977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { 6199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 6200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 6201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] /* dst_argb */ 6202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] /* table_argb */ 6203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 12] /* width */ 6204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel loop. 
6206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax] 6208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 4] 6209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4] 6210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4], dl 6211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax - 4 + 1] 6212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4 + 1] 6213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4 + 1], dl 6214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax - 4 + 2] 6215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx * 4 + 2] 6216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [eax - 4 + 2], dl 6217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian dec ecx 6218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 6221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 6222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_RGBCOLORTABLEROW_X86 6225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 6227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian// Tranform RGB pixels with luma table. 62287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 6229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 6230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int width, 6231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian const uint8* luma, uint32 lumacoeff) { 6232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 6233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 6234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 6235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] /* src_argb */ 6236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 8] /* dst_argb */ 6237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 12] /* width */ 6238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, dword ptr [esp + 8 + 16] // luma table 6239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff 6240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm2, xmm2, 0 6241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm3, xmm3, 0 6242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 6243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psllw xmm4, 8 6244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 6245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 pixel loop. 
6247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop: 6248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, qword ptr [eax] // generate luma ptr 6249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm3 6250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian phaddw xmm0, xmm0 6251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm4 // mask out low bits 6252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm5 6253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm2 // add table base 6254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd esi, xmm0 6255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax] 6258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi], dl 6260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 1] 6261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 1], dl 6263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 2] 6264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 2], dl 6266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 3] // copy alpha. 
6267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 3], dl 6268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd esi, xmm0 6270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 4] 6273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 4], dl 6275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 5] 6276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 5], dl 6278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 6] 6279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 6], dl 6281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 7] // copy alpha. 
6282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 7], dl 6283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd esi, xmm0 6285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 8] 6288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 8], dl 6290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 9] 6291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 9], dl 6293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 10] 6294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 10], dl 6296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 11] // copy alpha. 
6297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 11], dl 6298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd esi, xmm0 6300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 12] 6302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 12], dl 6304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 13] 6305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 13], dl 6307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 14] 6308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [esi + edx] 6309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 14], dl 6310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx edx, byte ptr [eax + 15] // copy alpha. 
6311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov byte ptr [edi + 15], dl 6312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 6314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 16] 63157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 6316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg convertloop 6317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 6319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 6320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 6321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 6322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 6323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif // defined(_M_X64) 63267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 6328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus 6329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} // extern "C" 6330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} // namespace libyuv 6331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif 6332