/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "libyuv/row.h"
12ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    defined(_MSC_VER) && !defined(__clang__)
15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include <emmintrin.h>
16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include <tmmintrin.h>  // For _mm_maddubs_epi16
17ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
18ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
19ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
20ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramaniannamespace libyuv {
21ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianextern "C" {
22ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
23ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
24ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// This module is for Visual C.
257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    defined(_MSC_VER) && !defined(__clang__)
277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstruct YuvConstants {
297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec8 kUVToB;     // 0
307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec8 kUVToG;     // 32
317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec8 kUVToR;     // 64
327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec16 kUVBiasB;  // 96
337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec16 kUVBiasG;  // 128
347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec16 kUVBiasR;  // 160
357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  lvec16 kYToRgb;   // 192
36ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
37ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// BT.601 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.596
//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
//  B = (Y - 16) * 1.164 - U * -2.018
//
// The chroma coefficients below are 6 bit fixed point (scaled by 64) and are
// stored with the sign shown in the reference above.

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB (-1160) /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB (-128) /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR (-102) /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128            + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR            (VR * 128 + YGB)
587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// BT601 constants for YUV to RGB.
607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic YuvConstants SIMD_ALIGNED(kYuvConstants) = {
617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
71ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
72ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// BT601 constants for NV21 where chroma plane is VU instead of UV.
747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic YuvConstants SIMD_ALIGNED(kYvuConstants) = {
757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
85ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
86ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YG
887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGB
897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UB
907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UG
917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VG
927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VR
937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BB
947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BG
957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BR
967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
// JPEG YUV to RGB reference
// *  R = Y                - V * -1.40200
// *  G = Y - U *  0.34414 - V *  0.71414
// *  B = Y - U * -1.77200
//
// The chroma coefficients below are 6 bit fixed point (scaled by 64) and are
// stored with the sign shown in the reference above.

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32  /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ (-113) /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414  * 64) */
#define VRJ (-90) /* round(-1.40200 * 64) */

// Bias values to round, and subtract 128 from U and V.  Unlike BT.601 there
// is no Y offset: YGBJ is only the 64 / 2 rounding term.
#define BBJ (UBJ * 128             + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ             (VRJ * 128 + YGBJ)
1177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// JPEG constants for YUV to RGB.
1197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
1207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
1217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
1227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
1237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
1247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
1257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
1267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
1277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
1287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
1297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
1307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
1317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
1327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
1337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
1347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
1357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGJ
1397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef YGBJ
1407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UBJ
1417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef UGJ
1427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VGJ
1437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef VRJ
1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BBJ
1457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BGJ
1467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#undef BRJ
147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 64 bit
149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#if defined(_M_X64)
#if defined(HAS_I422TOARGBROW_SSSE3)
// Converts one row of planar Y, U and V samples to ARGB using the BT.601
// tables in kYuvConstants.
//
//   y_buf    - luma samples, one byte per pixel.
//   u_buf    - U samples, one byte per two pixels.
//   v_buf    - V samples, one byte per two pixels.
//   dst_argb - output, 4 bytes per pixel stored in memory as B, G, R, A with
//              A set to 0xff.
//   width    - number of pixels to convert.
//
// Each iteration handles 8 pixels: it reads 8 Y, 4 U and 4 V bytes and
// writes 32 output bytes.  The loop runs while width > 0, so a width that is
// not a multiple of 8 is effectively rounded up and the buffers must be
// large enough for that.
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);  // 0xff bytes for the alpha channel.
  // Distance between the U and V planes, so u_buf alone can index both.
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

  while (width > 0) {
    // Load 4 U and 4 V bytes, interleave them into U,V pairs, then duplicate
    // each pair so it applies to two horizontally adjacent pixels.
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = xmm0;
    xmm2 = xmm0;

    // Chroma contribution per channel: bias - (U * coeff + V * coeff).
    // Only the first 16 bytes of each table row are used here.
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);

    // Luma contribution: replicate each Y byte into both halves of a 16 bit
    // lane (Y * 257) and keep the high half of the product with kYToRgb.
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);

    // Combine with saturation, drop the 6 fraction bits, clamp to 0..255.
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
    xmm0 = _mm_srai_epi16(xmm0, 6);
    xmm1 = _mm_srai_epi16(xmm1, 6);
    xmm2 = _mm_srai_epi16(xmm2, 6);
    xmm0 = _mm_packus_epi16(xmm0, xmm0);
    xmm1 = _mm_packus_epi16(xmm1, xmm1);
    xmm2 = _mm_packus_epi16(xmm2, xmm2);

    // Weave the channels: B,G pairs and R,A pairs, then B,G,R,A pixels.
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    xmm1 = xmm0;
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

    _mm_storeu_si128((__m128i *)dst_argb, xmm0);
    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);

    y_buf += 8;
    u_buf += 4;
    dst_argb += 32;
    width -= 8;
  }
}
#endif
201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 32 bit
202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#else  // defined(_M_X64)
203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBTOYROW_SSSE3
204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for ARGB.
206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToY = {
207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// JPeg full range.
211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToYJ = {
212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToU = {
216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToUJ = {
220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToV = {
224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToVJ = {
228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// vpshufb for vphaddw + vpackuswb packed to shorts.
232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const lvec8 kShufARGBToUV_AVX = {
233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
2347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for BGRA.
238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kBGRAToY = {
239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kBGRAToU = {
243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kBGRAToV = {
247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for ABGR.
251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kABGRToY = {
252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kABGRToU = {
256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kABGRToV = {
260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for RGBA.
264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kRGBAToY = {
265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kRGBAToU = {
269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kRGBAToV = {
273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kAddY16 = {
277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 7 bit fixed point 0.5.
281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec16 kAddYJ64 = {
282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  64, 64, 64, 64, 64, 64, 64, 64
283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kAddUV128 = {
286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec16 kAddUVJ128 = {
291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting RGB24 to ARGB.
295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskRGB24ToARGB = {
296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting RAW to ARGB.
300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskRAWToARGB = {
301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGB to RGB24.
305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRGB24 = {
306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGB to RAW.
310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRAW = {
311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
// Split layout: output bytes 0..7 hold the first 8 packed bytes, bytes 8..11
// are zeroed (128 has bit 7 set), and bytes 12..15 hold the remaining 4 packed
// bytes, so the result can be consumed as an 8-byte piece plus a 4-byte piece.
315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRGB24_0 = {
316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for converting ARGB to RAW, split layout.  First 8 + next 4
// Same layout as kShuffleMaskARGBToRGB24_0 (8 packed bytes, 4 zeroed bytes,
// then the remaining 4 packed bytes) but with the three kept bytes of each
// pixel in reversed order.
// NOTE(review): presumably the I422ToRAW counterpart of
// kShuffleMaskARGBToRGB24_0; the consumer is outside this section - confirm.
320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMaskARGBToRAW_0 = {
321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Duplicates gray value 3 times and fills in alpha opaque.
// Expands 8 Y bytes per iteration into 8 ARGB pixels (32 bytes): the three
// colour bytes of each pixel are the Y value and the alpha byte is 0xff.
//   src_y    - source gray bytes, 8 read per iteration.
//   dst_argb - destination, 32 bytes written per iteration (unaligned stores).
//   pix      - pixel count.  The loop body runs before pix is tested and steps
//              by 8, so the final group always reads 8 and writes 32 bytes.
// Naked function: there is no prologue, arguments are read from the stack.
3257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_y
329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]        // dst_argb
330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]       // pix
331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm5, 24
333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [eax]  // load 8 Y bytes
336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 8]
337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0             // each Y byte doubled
338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm0             // pixels 0..3: each Y repeated 4 times
340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm1, xmm1             // pixels 4..7
341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm5             // force alpha byte to 0xff
342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm1, xmm5
3437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
3447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1
345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8
347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_J400TOARGBROW_AVX2
3537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Duplicates gray value 3 times and fills in alpha opaque.
// AVX2 variant: expands 16 Y bytes per iteration into 16 ARGB pixels
// (64 bytes).  The unpack instructions work within each 128-bit lane, so
// vpermq 0xd8 (swap the two middle qwords) is applied before each unpack
// stage to keep the output pixels in source order.
//   pix - pixel count.  The loop body runs before pix is tested and steps by
//         16, so the final group always reads 16 and writes 64 bytes.
// Naked function: arguments are read from the stack; vzeroupper is issued
// before returning.
3547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         eax, [esp + 4]        // src_y
3587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         edx, [esp + 8]        // dst_argb
3597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         ecx, [esp + 12]       // pix
3607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
3617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpslld      ymm5, ymm5, 24
362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     xmm0, [eax]           // load 16 Y bytes into the low lane
3657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax,  [eax + 16]
3667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // Y0..7 in low lane, Y8..15 in high lane
3677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw  ymm0, ymm0, ymm0      // each Y byte doubled
3687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // regroup lanes for the word unpacks
3697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhwd  ymm1, ymm0, ymm0      // pixels 8..15
3707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd  ymm0, ymm0, ymm0      // pixels 0..7
3717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor        ymm0, ymm0, ymm5      // force alpha byte to 0xff
3727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor        ymm1, ymm1, ymm5
3737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], ymm0
3747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx + 32], ymm1
3757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 64]
3767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 16
3777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          convertloop
3787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_J400TOARGBROW_AVX2
383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts 16 packed 3-byte RGB24 pixels (48 bytes) per iteration into 16
// 4-byte ARGB pixels (64 bytes).  palignr carves the three 16-byte loads into
// four 12-byte groups, pshufb with kShuffleMaskRGB24ToARGB spreads each group
// to 4 bytes per pixel, and por sets every alpha byte to 0xff.
//   pix - pixel count.  The loop body runs before pix is tested and steps by
//         16, so the final group always reads 48 and writes 64 bytes.
// Naked function: there is no prologue, arguments are read from the stack.
// NOTE(review): movdqa on the shuffle table needs it to be 16-byte aligned;
// presumably the uvec8 type guarantees that - confirm in row.h.
3847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_rgb24
388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_argb
389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm5, 24
392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm4, kShuffleMaskRGB24ToARGB
393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]
396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm1, [eax + 16]
397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm3, [eax + 32]
398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 48]
399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm3
400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm2, xmm4
402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm2, xmm5
403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm0, xmm4
4057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx + 32], xmm2  // pixels 8..11
406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm5
407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm1, xmm4
4087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx], xmm0       // pixels 0..3
409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm5
410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm3, xmm4
4127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx + 16], xmm1  // pixels 4..7
413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm3, xmm5
4147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx + 48], xmm3  // pixels 12..15
415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 64]
4167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16
417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts 16 packed 3-byte RAW pixels (48 bytes) per iteration into 16
// 4-byte ARGB pixels (64 bytes).  Same data flow as RGB24ToARGBRow_SSSE3, but
// pshufb uses kShuffleMaskRAWToARGB, which reverses the three colour bytes of
// every pixel; por then sets every alpha byte to 0xff.
//   pix - pixel count.  The loop body runs before pix is tested and steps by
//         16, so the final group always reads 48 and writes 64 bytes.
// Naked function: there is no prologue, arguments are read from the stack.
// NOTE(review): movdqa on the shuffle table needs it to be 16-byte aligned;
// presumably the uvec8 type guarantees that - confirm in row.h.
4227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int pix) {
425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_raw
427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_argb
428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm5, 24
431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm4, kShuffleMaskRAWToARGB
432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]
435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm1, [eax + 16]
436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm3, [eax + 32]
437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 48]
438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm3
439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm2, xmm4
441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm2, xmm5
442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm0, xmm4
4447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx + 32], xmm2  // pixels 8..11
445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm5
446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm1, xmm4
4477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx], xmm0       // pixels 0..3
448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm5
449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm3, xmm4
4517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx + 16], xmm1  // pixels 4..7
452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm3, xmm5
4537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx + 48], xmm3  // pixels 12..15
454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 64]
4557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16
456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// pmul method to replicate bits.
462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Math to replicate bits:
463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// (v << 8) | (v << 3)
464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// v * 256 + v * 8
465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// v * (256 + 8)
466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 20 instructions.
// Converts 8 RGB565 pixels (16 bytes) per iteration into 8 ARGB pixels
// (32 bytes).  Each 5- or 6-bit field is widened to 8 bits by pmulhuw with a
// constant that replicates its top bits into the low bits; alpha is set to
// 0xff.
//   pix - pixel count.  The loop body runs before pix is tested and steps by
//         8, so the final group always reads 16 and writes 32 bytes.
// edx is rewritten as dst_argb - 2 * src_rgb565 so that [eax * 2 + edx]
// addresses the destination and only eax has to be advanced in the loop.
// Naked function: there is no prologue, arguments are read from the stack.
4687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int pix) {
471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd      xmm5, eax
474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd    xmm5, xmm5, 0
475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd      xmm6, eax
477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd    xmm6, xmm6, 0
478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm3, 11
480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm4, 10
482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw     xmm4, 5
483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm7, 8
485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_rgb565
487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_argb
488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edx, eax         // edx = dst_argb - 2 * src_rgb565
490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edx, eax
491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0
495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm0
496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm1, xmm3    // R in upper 5 bits
497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm2, 11      // B in upper 5 bits
498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw   xmm1, xmm5    // * (256 + 8)
499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw   xmm2, xmm5    // * (256 + 8)
500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm1, 8
501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm2    // RB
502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm0, xmm4    // G in middle 6 bits
503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm7    // AG
505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm1
506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw xmm1, xmm0    // interleave RB with AG: B G R A per pixel
507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw xmm2, xmm0
5087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
5097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 16]
511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 8
512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_RGB565TOARGBROW_AVX2
5187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// pmul method to replicate bits.
5197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Math to replicate bits:
5207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// (v << 8) | (v << 3)
5217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// v * 256 + v * 8
5227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// v * (256 + 8)
5237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// Converts 16 RGB565 pixels (32 bytes) per iteration into 16 ARGB pixels
// (64 bytes).  Each 5- or 6-bit field is widened to 8 bits by vpmulhuw with a
// constant that replicates its top bits into the low bits; alpha is set to
// 0xff.  vpermq 0xd8 pre-arranges the qwords so the in-lane byte unpacks
// produce pixels in source order.
//   pix - pixel count.  The loop body runs before pix is tested and steps by
//         16, so the final group always reads 32 and writes 64 bytes.
// edx is rewritten as dst_argb - 2 * src_rgb565 so that [eax * 2 + edx]
// addresses the destination and only eax has to be advanced in the loop.
// Naked function: arguments are read from the stack; vzeroupper is issued
// before returning.
5247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
5267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                          int pix) {
5277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
5287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
5297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm5, eax
5307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm5, xmm5
5317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    // VEX-encoded vmovd, not legacy movd: ymm5's upper half is already live,
    // so a legacy SSE encoding here would mix SSE and AVX state.
5327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm6, eax
5337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm6, xmm6
5347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
5357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm3, ymm3, 11
5367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
5377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm4, ymm4, 10
5387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm4, ymm4, 5
5397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
5407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm7, ymm7, 8
5417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
5427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_rgb565
5437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
5447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // pix
5457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edx, eax         // edx = dst_argb - 2 * src_rgb565
5467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edx, eax
5477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
5487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
5497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
5507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
5517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
5527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
5537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
5547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm1, ymm1, 8
5557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm1, ymm1, ymm2    // RB
5567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
5577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
5587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm0, ymm0, ymm7    // AG
5597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
5607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
5617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhbw ymm2, ymm1, ymm0
5627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm1, ymm1, ymm0
5637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
5647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
5657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea       eax, [eax + 32]
5667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16
5677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg        convertloop
5687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
5697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
5707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
5717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
5727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_RGB565TOARGBROW_AVX2
5737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
5747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGB1555TOARGBROW_AVX2
// Converts 16 ARGB1555 pixels (32 bytes) per iteration into 16 ARGB pixels
// (64 bytes).  Each 5-bit colour field is widened to 8 bits by vpmulhuw with
// a constant that replicates its top bits into the low bits.  The 1-bit alpha
// is spread over a full byte by an arithmetic right shift of the 16-bit pixel
// followed by a 0xff00 mask.  vpermq 0xd8 pre-arranges the qwords so the
// in-lane byte unpacks produce pixels in source order.
//   pix - pixel count.  The loop body runs before pix is tested and steps by
//         16, so the final group always reads 32 and writes 64 bytes.
// edx is rewritten as dst_argb - 2 * src_argb1555 so that [eax * 2 + edx]
// addresses the destination and only eax has to be advanced in the loop.
// Naked function: arguments are read from the stack; vzeroupper is issued
// before returning.
5757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
5777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                            int pix) {
5787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
5797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
5807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm5, eax
5817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm5, xmm5
5827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    // VEX-encoded vmovd, not legacy movd: ymm5's upper half is already live,
    // so a legacy SSE encoding here would mix SSE and AVX state.
5837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm6, eax
5847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm6, xmm6
5857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
5867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm3, ymm3, 11
5877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
5887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
5897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm7, ymm7, 8
5907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
5917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax,  [esp + 4]   // src_argb1555
5927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx,  [esp + 8]   // dst_argb
5937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx,  [esp + 12]  // pix
5947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edx,  eax         // edx = dst_argb - 2 * src_argb1555
5957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edx,  eax
5967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
5977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
5987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
5997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
6007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
6017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm1, ymm3
6027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
6037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
6047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm1, ymm1, 8
6057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm1, ymm1, ymm2    // RB
6067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsraw     ymm2, ymm0, 8       // A
6077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
6087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
6097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm2, ymm2, ymm7
6107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm0, ymm0, ymm2    // AG
6117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
6127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
6137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhbw ymm2, ymm1, ymm0
6147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm1, ymm1, ymm0
6157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
6167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
6177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea       eax, [eax + 32]
6187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16
6197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg        convertloop
6207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
6217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
6227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
6237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
6247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_ARGB1555TOARGBROW_AVX2
6257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
6267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGB4444TOARGBROW_AVX2
// Converts a row of ARGB4444 pixels (2 bytes each) to ARGB (4 bytes each)
// with AVX2, 16 pixels per loop iteration.
// Every 4-bit nibble n is widened to the byte (n << 4) | n.
// Stack arguments: [esp + 4] src_argb4444, [esp + 8] dst_argb, [esp + 12] pix.
// The count is tested after the loop body, so one full iteration (32 bytes
// read, 64 bytes written) always runs and pix is consumed in steps of 16.
6277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
6297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                            int pix) {
6307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
6317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
6327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd     xmm4, eax
6337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm4, xmm4  // replicate the mask into all 8 dwords
6347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
6357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       eax,  [esp + 4]   // src_argb4444
6367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       edx,  [esp + 8]   // dst_argb
6377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       ecx,  [esp + 12]  // pix
6387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       edx,  eax  // edx = dst - 2 * src (with the next line), so
6397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       edx,  eax  // [eax * 2 + edx] tracks dst while only eax advances
6407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
6417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
6427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
6437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm2, ymm0, ymm5    // mask high nibbles
6447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm4    // mask low nibbles
6457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm3, ymm2, 4       // high nibbles moved down to the low nibble
6467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsllw     ymm1, ymm0, 4       // low nibbles moved up to the high nibble
6477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm2, ymm2, ymm3    // bytes built from the high nibbles
6487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm0, ymm0, ymm1    // bytes built from the low nibbles
6497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
6507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm2, ymm2, 0xd8
6517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhbw ymm1, ymm0, ymm2    // interleave low-nibble and high-nibble bytes
6527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm0, ymm2
6537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
6547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
6557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea       eax, [eax + 32]  // advance source by 16 pixels without touching flags
6567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16
6577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
6587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
6597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
6607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
6617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
6627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_ARGB4444TOARGBROW_AVX2
6637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
// Converts a row of ARGB1555 pixels (2 bytes each) to ARGB (4 bytes each)
// with SSE2, 8 pixels per loop iteration.
// The 5-bit color channels are widened to 8 bits with pmulhuw so the top bits
// are repeated in the low bits; the 1-bit alpha becomes 0x00 or 0xff.
// Stack arguments: [esp + 4] src_argb1555, [esp + 8] dst_argb, [esp + 12] pix.
// The count is tested after the loop body, so one full iteration (16 bytes
// read, 32 bytes written) always runs and pix is consumed in steps of 8.
664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 24 instructions
6657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int pix) {
668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd      xmm5, eax
671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd    xmm5, xmm5, 0
672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd      xmm6, eax
674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd    xmm6, xmm6, 0
675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm3, 11
677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw     xmm4, 6
679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm7, 8
681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb1555
683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_argb
684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edx, eax  // edx = dst - 2 * src (with the next line), so
686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edx, eax  // [eax * 2 + edx] tracks dst while only eax advances
687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0
691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm0
692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm1, 1       // R in upper 5 bits
693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm2, 11      // B in upper 5 bits
694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm1, xmm3
695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw   xmm2, xmm5    // * (256 + 8)
696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw   xmm1, xmm5    // * (256 + 8)
697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm1, 8
698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm2    // RB
699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm0
700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm0, xmm4    // G in middle 5 bits
701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw     xmm2, 8       // A (arithmetic shift fills the upper byte with bit 15)
702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm2, xmm7
704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm2    // AG
705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm1
706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw xmm1, xmm0    // interleave RB with AG: bytes B, G, R, A per pixel
707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw xmm2, xmm0
7087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
7097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 16]  // advance source by 8 pixels without touching flags
711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 8
712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts a row of ARGB4444 pixels (2 bytes each) to ARGB (4 bytes each)
// with SSE2, 8 pixels per loop iteration.
// Every 4-bit nibble n is widened to the byte (n << 4) | n.
// Stack arguments: [esp + 4] src_argb4444, [esp + 8] dst_argb, [esp + 12] pix.
// The count is tested after the loop body, so one full iteration (16 bytes
// read, 32 bytes written) always runs and pix is consumed in steps of 8.
717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 18 instructions.
7187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int pix) {
721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd      xmm4, eax
724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd    xmm4, xmm4, 0  // replicate the mask into all 4 dwords
725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm5, 4
727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb4444
728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_argb
729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edx, eax  // edx = dst - 2 * src (with the next line), so
731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edx, eax  // [eax * 2 + edx] tracks dst while only eax advances
732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm0
736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm0, xmm4    // mask low nibbles
737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm2, xmm5    // mask high nibbles
738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0
739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm3, xmm2
740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm1, 4       // low nibbles moved up to the high nibble
741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw     xmm3, 4       // high nibbles moved down to the low nibble
742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm1    // bytes built from the low nibbles
743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm2, xmm3    // bytes built from the high nibbles
744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0
745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw xmm0, xmm2    // interleave low-nibble and high-nibble bytes
746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw xmm1, xmm2
7477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
7487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 16]  // advance source by 8 pixels without touching flags
750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 8
751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs a row of ARGB pixels (4 bytes each) into 3-byte pixels with SSSE3,
// 16 pixels per loop iteration (64 bytes read, 48 bytes written).
// The byte selection and order come from kShuffleMaskARGBToRGB24, which is
// defined outside this function; it is loaded with movdqa, so it has to be
// 16-byte aligned.
// Stack arguments: [esp + 4] src_argb, [esp + 8] dst_rgb, [esp + 12] pix.
// The count is tested after the loop body, so one full iteration always runs
// and pix is consumed in steps of 16.
7567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb
760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_rgb
761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm6, kShuffleMaskARGBToRGB24
763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm1, [eax + 16]
767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm2, [eax + 32]
768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm3, [eax + 48]
769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 64]
770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm1, xmm6
772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm2, xmm6
773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm3, xmm6
774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq    xmm1, 4      // 8 bytes from 1
776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslldq    xmm4, 12     // 4 bytes from 1 for 0
777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm4   // 4 bytes from 1 for 0
779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslldq    xmm5, 8      // 8 bytes from 2 for 1
780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx], xmm0  // store 0
781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm5   // 8 bytes from 2 for 1
782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq    xmm2, 8      // 4 bytes from 2
783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslldq    xmm3, 4      // 12 bytes from 3 for 2
784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm2, xmm3   // 12 bytes from 3 for 2
785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx + 16], xmm1   // store 1
786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx + 32], xmm2   // store 2
787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 48]  // advance destination by 16 packed pixels
788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 16
789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs a row of ARGB pixels (4 bytes each) into 3-byte pixels with SSSE3,
// 16 pixels per loop iteration (64 bytes read, 48 bytes written).
// The instruction sequence matches ARGBToRGB24Row_SSSE3; only the shuffle
// table differs. kShuffleMaskARGBToRAW is defined outside this function and
// is loaded with movdqa, so it has to be 16-byte aligned.
// Stack arguments: [esp + 4] src_argb, [esp + 8] dst_rgb, [esp + 12] pix.
// The count is tested after the loop body, so one full iteration always runs
// and pix is consumed in steps of 16.
7947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb
798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_rgb
799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm6, kShuffleMaskARGBToRAW
801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm1, [eax + 16]
805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm2, [eax + 32]
806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    xmm3, [eax + 48]
807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 64]
808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm1, xmm6
810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm2, xmm6
811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm3, xmm6
812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq    xmm1, 4      // 8 bytes from 1
814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslldq    xmm4, 12     // 4 bytes from 1 for 0
815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm4   // 4 bytes from 1 for 0
817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslldq    xmm5, 8      // 8 bytes from 2 for 1
818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx], xmm0  // store 0
819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm5   // 8 bytes from 2 for 1
820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq    xmm2, 8      // 4 bytes from 2
821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslldq    xmm3, 4      // 12 bytes from 3 for 2
822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm2, xmm3   // 12 bytes from 3 for 2
823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx + 16], xmm1   // store 1
824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx + 32], xmm2   // store 2
825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 48]  // advance destination by 16 packed pixels
826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 16
827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts a row of ARGB pixels (4 bytes each) to RGB565 (2 bytes each) with
// SSE2, keeping the top 5 bits of B, 6 bits of G and 5 bits of R.
// Stack arguments: [esp + 4] src_argb, [esp + 8] dst_rgb, [esp + 12] pix.
// The count is tested after the loop body, so one full iteration (16 bytes
// read, 8 bytes written) always runs and pix is consumed in steps of 4.
8327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 pixels
8337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb
837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_rgb
838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm3, 27
841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm4, 26
843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm4, 5
844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm5, 11
846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
8487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
8497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa    xmm1, xmm0    // B
8507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa    xmm2, xmm0    // G
8517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pslld     xmm0, 8       // R
8527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrld     xmm1, 3       // B
8537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrld     xmm2, 5       // G
8547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrad     xmm0, 16      // R
8557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pand      xmm1, xmm3    // B
8567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pand      xmm2, xmm4    // G
8577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pand      xmm0, xmm5    // R
8587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    por       xmm1, xmm2    // BG
8597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    por       xmm0, xmm1    // BGR
    // The pslld/psrad pair leaves each dword as the sign extension of its low
    // 16 bits, so the signed saturating pack below keeps those bits unchanged.
8607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    packssdw  xmm0, xmm0
8617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea       eax, [eax + 16]
8627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
8637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea       edx, [edx + 8]
8647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 4
8657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
8667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
8677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
8687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
8697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
// Converts a row of ARGB pixels (4 bytes each) to RGB565 (2 bytes each) with
// SSE2, adding a per-pixel dither value with unsigned saturation first.
// dither4 holds four dither bytes; byte i is added to every channel byte of
// pixel i within each group of four pixels.
// Stack arguments: [esp + 4] src_argb, [esp + 8] dst_rgb, [esp + 12] dither4,
// [esp + 16] pix.
// The count is tested after the loop body, so one full iteration (16 bytes
// read, 8 bytes written) always runs and pix is consumed in steps of 4.
8707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 pixels per loop iteration
8717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
8727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
8737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                const uint32 dither4, int pix) {
8747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
8757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
8767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb
8777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_rgb
8787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movd      xmm6, [esp + 12] // dither4
8797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov       ecx, [esp + 16]  // pix
8807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    punpcklbw xmm6, xmm6       // make dither 16 bytes
    // NOTE(review): xmm7 is written by the two xmm7 instructions below but is
    // never read anywhere in this function.
8817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa    xmm7, xmm6
8827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    punpcklwd xmm6, xmm6       // each dither byte now fills one whole dword
8837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    punpckhwd xmm7, xmm7
8847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
8857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrld     xmm3, 27
8867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
8877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrld     xmm4, 26
8887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pslld     xmm4, 5
8897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
8907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pslld     xmm5, 11
8917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
8927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
8937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
8947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    paddusb   xmm0, xmm6    // add dither
895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0    // B
896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm0    // G
897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm0, 8       // R
898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm1, 3       // B
899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm2, 5       // G
900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrad     xmm0, 16      // R
901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm1, xmm3    // B
902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm2, xmm4    // G
903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm0, xmm5    // R
904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm1, xmm2    // BG
905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm1    // BGR
    // The pslld/psrad pair leaves each dword as the sign extension of its low
    // 16 bits, so the signed saturating pack below keeps those bits unchanged.
906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw  xmm0, xmm0
907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 16]
908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 8]
910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 4
911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// Converts a row of ARGB pixels (4 bytes each) to RGB565 (2 bytes each) with
// AVX2, adding a per-pixel dither value with unsigned saturation first.
// dither4 holds four dither bytes; byte i is added to every channel byte of
// pixel i within each group of four pixels.
// Stack arguments: [esp + 4] src_argb, [esp + 8] dst_rgb, [esp + 12] dither4,
// [esp + 16] pix.
// The count is tested after the loop body, so one full iteration (32 bytes
// read, 16 bytes written) always runs and pix is consumed in steps of 8.
9177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
9187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
9197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                const uint32 dither4, int pix) {
9207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
9217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]      // src_argb
9227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8]      // dst_rgb
9237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss xmm6, [esp + 12]  // dither4
9247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 16]     // pix
9257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
9267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm6, ymm6, 0xd8    // place one doubled copy in each 128-bit lane
9277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd ymm6, ymm6, ymm6    // each dither byte now fills one whole dword
9287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
9297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrld     ymm3, ymm3, 27
9307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
9317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrld     ymm4, ymm4, 26
9327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpslld     ymm4, ymm4, 5
9337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
9347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
9357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
9367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
9377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpaddusb   ymm0, ymm0, ymm6    // add dither
9387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrld     ymm2, ymm0, 5       // G
9397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrld     ymm1, ymm0, 3       // B
9407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrld     ymm0, ymm0, 8       // R
9417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm2, ymm2, ymm4    // G
9427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm1, ymm3    // B
9437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm5    // R
9447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm1, ymm1, ymm2    // BG
9457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm0, ymm0, ymm1    // BGR
    // After the three masks every dword is at most 0xffff, so the unsigned
    // saturating pack below never clamps.
9467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackusdw  ymm0, ymm0, ymm0
9477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8    // gather both lanes' packed pixels into the low 128 bits
9487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax, [eax + 32]
9497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
9507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 16]
9517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
9527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop  // repeat while the remaining count is positive
9537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
9547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
9557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
9567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
9577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
9587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Improve sign extension/packing.
9607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb
964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_rgb
965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm4, 27
968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm5, xmm4       // generate mask 0x000003e0
969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm5, 5
970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm6, xmm4       // generate mask 0x00007c00
971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm6, 10
972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld     xmm7, 15
974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
9767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0    // B
978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm2, xmm0    // G
979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm3, xmm0    // R
980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrad     xmm0, 16      // A
981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm1, 3       // B
982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm2, 6       // G
983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld     xmm3, 9       // R
984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm0, xmm7    // A
985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm1, xmm4    // B
986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm2, xmm5    // G
987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm3, xmm6    // R
988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm1    // BA
989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm2, xmm3    // GR
990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm2    // BGRA
991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw  xmm0, xmm0
992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 16]
993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 8]
995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 4
996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
10017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
1003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src_argb
1005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst_rgb
1006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // pix
1007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
1008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm4, 12
1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw     xmm3, 8
1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
10137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0
1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm0, xmm3    // low nibble
1016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand      xmm1, xmm4    // high nibble
10177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrld     xmm0, 4
10187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrld     xmm1, 8
1019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm1
1020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb  xmm0, xmm0
1021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + 16]
1022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
1023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 8]
1024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       ecx, 4
1025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
1026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
#ifdef HAS_ARGBTORGB565ROW_AVX2
// Convert a row of ARGB pixels to RGB565.
// Each iteration reads 8 pixels (32 bytes) from src_argb and writes 8 pixels
// (16 bytes) to dst_rgb, keeping the top 5 bits of B and R and the top 6 bits
// of G; alpha is discarded.
// Naked function: arguments are read directly from the stack.  The loop body
// runs before the count is tested, so pix is effectively rounded up to a
// multiple of 8 and at least one group of 8 pixels is always converted.
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // pix
    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
    vpsrld     ymm3, ymm3, 27
    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
    vpsrld     ymm4, ymm4, 26
    vpslld     ymm4, ymm4, 5
    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm2, ymm0, 5       // G
    vpsrld     ymm1, ymm0, 3       // B
    vpsrld     ymm0, ymm0, 8       // R
    vpand      ymm2, ymm2, ymm4    // G
    vpand      ymm1, ymm1, ymm3    // B
    vpand      ymm0, ymm0, ymm5    // R
    vpor       ymm1, ymm1, ymm2    // BG
    vpor       ymm0, ymm0, ymm1    // BGR
    // Every dword is <= 0xffff here, so the unsigned saturating pack only
    // narrows.  It works per 128 bit lane, leaving the results in qwords 0
    // and 2; vpermq 0xd8 moves them next to each other in the low 128 bits.
    vpackusdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2
10667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
// Convert a row of ARGB pixels to ARGB1555.
// Each iteration reads 8 pixels (32 bytes) from src_argb and writes 8 pixels
// (16 bytes) to dst_rgb, keeping the top alpha bit and the top 5 bits of each
// of R, G and B.
// Naked function: arguments are read directly from the stack.  The loop body
// runs before the count is tested, so pix is effectively rounded up to a
// multiple of 8 and at least one group of 8 pixels is always converted.
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]      // src_argb
    mov        edx, [esp + 8]      // dst_rgb
    mov        ecx, [esp + 12]     // pix
    vpcmpeqb   ymm4, ymm4, ymm4
    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
    vpslld     ymm7, ymm7, 15

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    vpsrld     ymm3, ymm0, 9       // R
    vpsrld     ymm2, ymm0, 6       // G
    vpsrld     ymm1, ymm0, 3       // B
    // Arithmetic shift: the top alpha bit lands in bit 15 and is replicated
    // through bits 16..31, so each dword stays a valid signed 16 bit value.
    vpsrad     ymm0, ymm0, 16      // A
    vpand      ymm3, ymm3, ymm6    // R
    vpand      ymm2, ymm2, ymm5    // G
    vpand      ymm1, ymm1, ymm4    // B
    vpand      ymm0, ymm0, ymm7    // A
    vpor       ymm0, ymm0, ymm1    // BA
    vpor       ymm2, ymm2, ymm3    // GR
    vpor       ymm0, ymm0, ymm2    // BGRA
    // Signed pack never saturates because of the sign extension above.  It
    // works per 128 bit lane, leaving the results in qwords 0 and 2; vpermq
    // 0xd8 moves them next to each other in the low 128 bits.
    vpackssdw  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2
11067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
// Convert a row of ARGB pixels to ARGB4444.
// Each iteration reads 8 pixels (32 bytes) from src_argb and writes 8 pixels
// (16 bytes) to dst_rgb, keeping the top 4 bits of every channel.
// Naked function: arguments are read directly from the stack.  The loop body
// runs before the count is tested, so pix is effectively rounded up to a
// multiple of 8 and at least one group of 8 pixels is always converted.
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
    vpsllw     ymm4, ymm4, 12
    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0

 convertloop:
    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
    // Both masks select the HIGH nibble of a source byte; "low" and "high"
    // name the nibble each one becomes in the packed output byte.
    vpand      ymm1, ymm0, ymm4    // top 4 bits of G and A -> high nibble
    vpand      ymm0, ymm0, ymm3    // top 4 bits of B and R -> low nibble
    vpsrld     ymm1, ymm1, 8
    vpsrld     ymm0, ymm0, 4
    vpor       ymm0, ymm0, ymm1
    // Every word is <= 0xff here, so the unsigned saturating pack only
    // narrows.  It works per 128 bit lane, leaving the results in qwords 0
    // and 2; vpermq 0xd8 moves them next to each other in the low 128 bits.
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8
    lea        eax, [eax + 32]
    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
    lea        edx, [edx + 16]
    sub        ecx, 8
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2
11377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
11397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
1143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_y */
1144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* pix */
1145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kARGBToY
11467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm5, kAddY16
1147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
11497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
11507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
11517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
11527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
1153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm4
1154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
1155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm4
1156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm4
1157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
1158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
1159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm2, xmm3
1160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
1161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 7
1162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
1163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5
11647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
11667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
11737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
11747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
1178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_y */
1179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* pix */
1180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kARGBToYJ
1181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddYJ64
1182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
11847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
11857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
11867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
11877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
1188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm4
1189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
1190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm4
1191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm4
1192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
1193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
1194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm2, xmm3
1195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm5  // Add .5 for rounding.
1196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm2, xmm5
1197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
1198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 7
1199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
12007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
12027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// vphaddw and vpackuswb both operate on each 128 bit lane separately, so
// after them the 4-pixel groups sit in dword order 0,2,4,6,1,3,5,7.  Using
// these indices with vpermd puts them back in pixel order.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// Each iteration reads 128 bytes from src_argb and writes 32 bytes to dst_y.
// kARGBToY and kAddY16 are 128 bit constants defined elsewhere in this file;
// each is broadcast to both lanes.
// Naked function: arguments are read directly from the stack.  The loop body
// runs before the count is tested, so pix is effectively rounded up to a
// multiple of 32 and at least one group of 32 pixels is always converted.
__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqu    ymm6, kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    // Multiply unsigned source bytes by signed weights and add adjacent
    // pairs: two 16 bit partial sums per pixel.
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2
1251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
12527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBTOYJROW_AVX2
1253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// Arguments are read straight off the stack: src_argb (source pixels), dst_y
// (destination bytes) and pix (pixel count).  Each pass of the loop consumes
// 128 source bytes and stores 32 output bytes.  The count is only tested after
// a pass, so the body always runs at least once and a pix that is not a
// multiple of 32 still gets a full 32-pixel final pass.
// NOTE(review): kARGBToYJ, kAddYJ64 and kPermdARGBToY_AVX are defined outside
// this chunk; their contents (coefficients, rounding bias, dword order) are
// assumed from their names and the comments below - confirm.
12547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
1256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
1258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_y */
1259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* pix */
1260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm4, kARGBToYJ  // Same 16 bytes in both 128-bit lanes.
1261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm5, kAddYJ64  // Same 16 bytes in both 128-bit lanes.
12627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm6, kPermdARGBToY_AVX  // Dword indices used by vpermd below.
1263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Load 128 source bytes (unaligned loads).
1265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
1266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
1267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm2, [eax + 64]
1268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm3, [eax + 96]
    // Multiply the unsigned pixel bytes by the signed bytes in ymm4 and add
    // adjacent products: two 16-bit partial sums per 4-byte pixel.
1269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm0, ymm0, ymm4
1270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm1, ymm1, ymm4
1271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm2, ymm2, ymm4
1272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm3, ymm3, ymm4
1273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 128]
    // Add the two partial sums of each pixel.  vphaddw works inside each
    // 128-bit lane, so the words are no longer in pixel order ("mutates").
1274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vphaddw    ymm0, ymm0, ymm1  // mutates.
1275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vphaddw    ymm2, ymm2, ymm3
1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpaddw     ymm2, ymm2, ymm5
    // Unsigned shift right by 7 scales the 16-bit sums down by 128.
1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 7
1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm2, ymm2, 7
    // Narrow words to bytes with unsigned saturation (also lane-local), then
    // reorder dwords with the table in ymm6 before the 32-byte store.
1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm2  // mutates.
1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
    // Count down 32 pixels; loop again while the remainder is still > 0.
12847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32
1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Zero the upper halves of the ymm registers before returning.
1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  //  HAS_ARGBTOYJROW_AVX2
1292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts 4-byte source pixels to one Y byte each, 16 pixels (64 source
// bytes, 16 output bytes) per loop pass.  Arguments are read straight off the
// stack: src_argb (source pixels), dst_y (destination bytes), pix (pixel
// count).  The count is only tested after a pass, so the body always runs at
// least once and a pix that is not a multiple of 16 still gets a full pass.
// Same instruction sequence as ABGRToYRow_SSSE3 and RGBAToYRow_SSSE3; only the
// coefficient table loaded into xmm4 differs.
// NOTE(review): kBGRAToY and kAddY16 are defined outside this chunk; their
// contents are assumed from their names - confirm.
12937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
12947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
1297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_y */
1298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* pix */
    // movdqa is an aligned load: both constants must be 16-byte aligned.
12997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm4, kBGRAToY
1300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddY16
1301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Load 64 source bytes (unaligned loads).
1303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
1306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
    // Multiply the unsigned pixel bytes by the signed bytes in xmm4 and add
    // adjacent products: two 16-bit partial sums per pixel.
1307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm4
1308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
1309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm4
1310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm4
1311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
    // Add the two partial sums of each pixel: 8 words in xmm0, 8 in xmm2.
1312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
1313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm2, xmm3
    // Unsigned shift right by 7 scales the 16-bit sums down by 128.
1314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
1315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 7
    // Narrow to 16 bytes with unsigned saturation, then add the per-byte
    // offset held in xmm5 (wrapping byte add).
1316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
1317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5
1318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
    // Count down 16 pixels; loop again while the remainder is still > 0.
13207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts 4-byte source pixels to one Y byte each, 16 pixels (64 source
// bytes, 16 output bytes) per loop pass.  Arguments are read straight off the
// stack: src_argb (source pixels), dst_y (destination bytes), pix (pixel
// count).  The count is only tested after a pass, so the body always runs at
// least once and a pix that is not a multiple of 16 still gets a full pass.
// Same instruction sequence as BGRAToYRow_SSSE3 and RGBAToYRow_SSSE3; only the
// coefficient table loaded into xmm4 differs.
// NOTE(review): kABGRToY and kAddY16 are defined outside this chunk; their
// contents are assumed from their names - confirm.
13267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
13277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_y */
1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* pix */
    // movdqa is an aligned load: both constants must be 16-byte aligned.
13327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm4, kABGRToY
13337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm5, kAddY16
1334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Load 64 source bytes (unaligned loads).
1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
    // Multiply the unsigned pixel bytes by the signed bytes in xmm4 and add
    // adjacent products: two 16-bit partial sums per pixel.
1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm4
1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm4
1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm4
1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
    // Add the two partial sums of each pixel: 8 words in xmm0, 8 in xmm2.
1345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm2, xmm3
    // Unsigned shift right by 7 scales the 16-bit sums down by 128.
1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 7
    // Narrow to 16 bytes with unsigned saturation, then add the per-byte
    // offset held in xmm5 (wrapping byte add).
1349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
13507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    paddb      xmm0, xmm5
1351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
    // Count down 16 pixels; loop again while the remainder is still > 0.
13537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts 4-byte source pixels to one Y byte each, 16 pixels (64 source
// bytes, 16 output bytes) per loop pass.  Arguments are read straight off the
// stack: src_argb (source pixels), dst_y (destination bytes), pix (pixel
// count).  The count is only tested after a pass, so the body always runs at
// least once and a pix that is not a multiple of 16 still gets a full pass.
// Same instruction sequence as BGRAToYRow_SSSE3 and ABGRToYRow_SSSE3; only the
// coefficient table loaded into xmm4 differs.
// NOTE(review): kRGBAToY and kAddY16 are defined outside this chunk; their
// contents are assumed from their names - confirm.
13597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
13607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
1363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_y */
1364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* pix */
    // movdqa is an aligned load: both constants must be 16-byte aligned.
1365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kRGBAToY
1366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddY16
1367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Load 64 source bytes (unaligned loads).
1369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
1372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
    // Multiply the unsigned pixel bytes by the signed bytes in xmm4 and add
    // adjacent products: two 16-bit partial sums per pixel.
1373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm4
1374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
1375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm4
1376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm4
1377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
    // Add the two partial sums of each pixel: 8 words in xmm0, 8 in xmm2.
1378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
1379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm2, xmm3
    // Unsigned shift right by 7 scales the 16-bit sums down by 128.
1380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
1381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 7
    // Narrow to 16 bytes with unsigned saturation, then add the per-byte
    // offset held in xmm5 (wrapping byte add).
1382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
1383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5
1384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
    // Count down 16 pixels; loop again while the remainder is still > 0.
13867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Produces 8 U bytes and 8 V bytes per loop pass from a 16x2 block of 4-byte
// pixels.  src_argb0 is the first row; the second row is read at
// src_argb0 + src_stride_argb.  Each 2x2 group of pixels is averaged (the two
// rows first, then horizontal neighbours) before being multiplied by the U
// and V tables.  width is counted down by 16 per pass and tested only after
// the pass, so the body always runs at least once and always handles a full
// 16-pixel group.
// NOTE(review): kARGBToU, kARGBToV and kAddUV128 are defined outside this
// chunk; their contents are assumed from their names - confirm.
13927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_u, uint8* dst_v, int width) {
1395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
    // esi/edi are saved here and restored before ret; the two pushes move the
    // arguments 8 bytes further from esp, hence the "esp + 8 + N" offsets.
1396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb
1399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_stride_argb
1400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // dst_u
1401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]  // dst_v
1402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // movdqa is an aligned load: the three constants must be 16-byte aligned.
1403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUV128
14047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kARGBToV
14057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kARGBToU
    // Keep dst_v as an offset from dst_u so only edx has to be advanced.
1406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Vertical pass: rounded byte average of each 16-byte group with the
    // group at the same position one stride further on.
14107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
14117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi]
14127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
14137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
14147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 16]
14157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm1, xmm4
14167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
14177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 32]
14187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
14197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
14207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 48]
14217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm3, xmm4
14227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    // Horizontal pass: shufps 0x88 gathers dwords 0 and 2 of each operand
    // (even pixels), 0xdd gathers dwords 1 and 3 (odd pixels); averaging the
    // two leaves 4 pixels in xmm0 and 4 in xmm2.
1423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
1424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
1425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88
1426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm1, 0xdd
1427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
1428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2
1429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm3, 0x88
1430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm3, 0xdd
1431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
1432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // Copies in xmm1/xmm3 take the V table, the originals take the U table.
1436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
1437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
1438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7  // U
1439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
1440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6  // V
1441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
    // One signed 16-bit sum per pixel: 8 U words in xmm0, 8 V words in xmm1.
1442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm2
1443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm1, xmm3
    // Arithmetic shift keeps the sign; pack with signed saturation so the
    // low 8 bytes are U and the high 8 bytes are V.
1444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8
1445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 8
1446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm1
1447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5            // -> unsigned
1448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 8 U and 8 V values
1450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlps     qword ptr [edx], xmm0 // U
1451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     qword ptr [edx + edi], xmm0 // V
1452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
    // Count down 16 source pixels; loop while the remainder is still > 0.
14537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Produces 8 U bytes and 8 V bytes per loop pass from a 16x2 block of 4-byte
// pixels, using the kARGBToUJ / kARGBToVJ tables.  src_argb0 is the first
// row; the second row is read at src_argb0 + src_stride_argb.  Each 2x2 group
// of pixels is averaged (the two rows first, then horizontal neighbours)
// before the multiply.  Unlike ARGBToUVRow_SSSE3, the bias in xmm5 is added
// to the 16-bit sums before the shift and nothing is added after packing.
// width is counted down by 16 per pass and tested only after the pass, so the
// body always runs at least once and always handles a full 16-pixel group.
// NOTE(review): kARGBToUJ, kARGBToVJ and kAddUVJ128 are defined outside this
// chunk; their contents are assumed from their names - confirm.
14627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_u, uint8* dst_v, int width) {
1465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
    // esi/edi are saved here and restored before ret; the two pushes move the
    // arguments 8 bytes further from esp, hence the "esp + 8 + N" offsets.
1466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb
1469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_stride_argb
1470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // dst_u
1471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]  // dst_v
1472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // movdqa is an aligned load: the three constants must be 16-byte aligned.
1473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUVJ128
14747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kARGBToVJ
14757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kARGBToUJ
    // Keep dst_v as an offset from dst_u so only edx has to be advanced.
1476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Vertical pass: rounded byte average of each 16-byte group with the
    // group at the same position one stride further on.
14807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
14817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi]
14827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
14837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
14847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 16]
14857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm1, xmm4
14867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
14877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 32]
14887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
14897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
14907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 48]
14917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm3, xmm4
14927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    // Horizontal pass: shufps 0x88 gathers dwords 0 and 2 of each operand
    // (even pixels), 0xdd gathers dwords 1 and 3 (odd pixels); averaging the
    // two leaves 4 pixels in xmm0 and 4 in xmm2.
1493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
1494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
1495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88
1496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm1, 0xdd
1497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
1498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2
1499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm3, 0x88
1500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm3, 0xdd
1501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
1502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // Copies in xmm1/xmm3 take the V table, the originals take the U table.
1506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
1507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
1508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7  // U
1509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
1510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6  // V
1511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
    // One signed 16-bit sum per pixel: 8 U words in xmm0, 8 V words in xmm1.
1512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm2
1513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm1, xmm3
1514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
1515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm1, xmm5
    // Arithmetic shift by 8, then pack with signed saturation so the low
    // 8 bytes are U and the high 8 bytes are V.
1516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8
1517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 8
1518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm1
1519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 8 U and 8 V values
1521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlps     qword ptr [edx], xmm0 // U
1522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     qword ptr [edx + edi], xmm0 // V
1523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
    // Count down 16 source pixels; loop while the remainder is still > 0.
15247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
1525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBTOUVROW_AVX2
// Produces 16 U bytes and 16 V bytes per loop pass from a 32x2 block of 4-byte
// pixels.  src_argb0 is the first row; the second row is read at
// src_argb0 + src_stride_argb.  Each 2x2 group of pixels is averaged (the two
// rows first, then horizontal neighbours) before being multiplied by the U
// and V tables.  width is counted down by 32 per pass and tested only after
// the pass, so the body always runs at least once and always handles a full
// 32-pixel group.
// NOTE(review): kARGBToU, kARGBToV, kAddUV128 and kShufARGBToUV_AVX are
// defined outside this chunk; their contents are assumed from their names and
// the comments below - confirm.
15347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int width) {
1537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
    // esi/edi are saved here and restored before ret; the two pushes move the
    // arguments 8 bytes further from esp, hence the "esp + 8 + N" offsets.
1538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb
1541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_stride_argb
1542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // dst_u
1543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]  // dst_v
1544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // Each 16-byte constant is copied into both 128-bit lanes.
1545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm5, kAddUV128
1546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm6, kARGBToV
1547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm7, kARGBToU
    // Keep dst_v as an offset from dst_u so only edx has to be advanced.
1548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    // Vertical pass: load the first row, then take the rounded byte average
    // with the second row read directly as a memory operand.
1552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
1553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
1554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm2, [eax + 64]
1555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm3, [eax + 96]
1556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm0, ymm0, [eax + esi]
1557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm1, ymm1, [eax + esi + 32]
1558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm2, ymm2, [eax + esi + 64]
1559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm3, ymm3, [eax + esi + 96]
1560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 128]
    // Horizontal pass: 0x88 gathers even dwords, 0xdd odd dwords, within
    // each 128-bit lane, so the averaged pixels are not in linear order.
1561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vshufps    ymm4, ymm0, ymm1, 0x88
1562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vshufps    ymm0, ymm0, ymm1, 0xdd
1563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vshufps    ymm4, ymm2, ymm3, 0x88
1565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vshufps    ymm2, ymm2, ymm3, 0xdd
1566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    // U products go to ymm1/ymm3 first so ymm0/ymm2 can be reused for V.
1571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm1, ymm0, ymm7  // U
1572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm3, ymm2, ymm7
1573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm0, ymm0, ymm6  // V
1574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmaddubsw ymm2, ymm2, ymm6
1575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vphaddw    ymm1, ymm1, ymm3  // mutates
1576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vphaddw    ymm0, ymm0, ymm2
1577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsraw     ymm1, ymm1, 8
1578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsraw     ymm0, ymm0, 8
    // Pack U (ymm1) and V (ymm0) to signed-saturated bytes; the pack is
    // lane-local, so vpermq 0xd8 (qword order 0,2,1,3) gathers U into the
    // low 128 bits and V into the high 128 bits.
1579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpacksswb  ymm0, ymm1, ymm0  // mutates
1580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
1582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 16 U and 16 V values
1585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vextractf128 [edx], ymm0, 0 // U
1586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vextractf128 [edx + edi], ymm0, 1 // V
1587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
    // Count down 32 source pixels; loop while the remainder is still > 0.
15887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32
1589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Zero the upper halves of the ymm registers before returning.
1593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
1594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBTOUVROW_AVX2
1598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of ARGB pixels to U and V with no chroma subsampling.
// Each pass of the loop reads 64 source bytes (16 four-byte pixels) and writes
// 16 bytes to dst_u and 16 bytes to dst_v. The count is only tested after the
// body has run, so the loop always executes at least once and always handles
// whole groups of 16.
// Naked function: there is no compiler prologue, so the arguments are fetched
// by hand relative to esp.
15997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
16007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBToUV444Row_SSSE3(const uint8* src_argb0,
16017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                          uint8* dst_u, uint8* dst_v, int width) {
1602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                  // saved here, restored before ret
    // Offsets below: first "4" skips the pushed edi, the second term is the
    // argument's position past the return address.
16047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
16057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]   // dst_u
16067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]  // dst_v
16077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
    // Loop-invariant constants (defined elsewhere in the file).
1608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUV128      // bias added after packing
16097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kARGBToV       // V coefficients
16107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kARGBToU       // U coefficients
    // edi becomes (dst_v - dst_u) so that [edx + edi] addresses V while only
    // edx has to be advanced.
1611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
16147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* convert to U and V: first pass computes U, second pass computes V */
16157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]          // U: load 16 pixels, 4 per register
1616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
1618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
    // pmaddubsw: unsigned pixel bytes * signed coefficient bytes, adjacent
    // products summed into signed 16-bit words (two words per pixel).
16197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7
16207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm7
1621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
16227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm7
    // phaddw: add each pixel's two words -> one 16-bit sum per pixel.
16237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    phaddw     xmm0, xmm1           // pixels 0-7
16247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    phaddw     xmm2, xmm3           // pixels 8-15
1625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8              // arithmetic shift right by 8
16267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psraw      xmm2, 8
16277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    packsswb   xmm0, xmm2           // 16 words -> 16 signed bytes, saturating
16287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    paddb      xmm0, xmm5           // add bias -> unsigned
16297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0          // store 16 U
1630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // The U pass overwrote xmm0-xmm3, so the same 64 source bytes are loaded
    // again for the V pass.
1631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]          // V
1632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
1634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
1635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm6
1636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6
1637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm6
1638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
1639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
1640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm2, xmm3
1641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8
1642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm2, 8
1643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm2
1644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5
1645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]     // next 16 source pixels
1646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx + edi], xmm0    // store 16 V
1647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx,  [edx + 16]     // advances U (and, via edi, V)
16487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx,  16             // sets the flags tested by jg
1649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of ARGB pixels to U and V, subsampled 2:1 horizontally.
// Only a single source row is read (there is no stride argument). Each pass of
// the loop reads 64 source bytes (16 four-byte pixels), averages horizontally
// adjacent pixel pairs down to 8 pixels, and writes 8 bytes to dst_u and
// 8 bytes to dst_v. The count is only tested after the body has run, so the
// loop always executes at least once and always handles whole groups of 16.
// Naked function: arguments are fetched by hand relative to esp.
16567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_u, uint8* dst_v, int width) {
1659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                  // saved here, restored before ret
    // Offsets below: first "4" skips the pushed edi, the second term is the
    // argument's position past the return address.
1661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
1662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]   // dst_u
1663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]  // dst_v
1664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
    // Loop-invariant constants (defined elsewhere in the file).
1665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUV128      // bias added after packing
16667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kARGBToV       // V coefficients
16677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kARGBToU       // U coefficients
    // edi becomes (dst_v - dst_u) so that [edx + edi] addresses V while only
    // edx has to be advanced.
1668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 16x1 argb pixels to 8x1 */
16727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]          // pixels 0-3
16737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]     // pixels 4-7
16747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]     // pixels 8-11
16757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]     // pixels 12-15
1676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]     // next 16 source pixels
    // shufps treats each 4-byte pixel as one dword: mask 0x88 picks dwords
    // 0 and 2 of each operand (even pixels), 0xdd picks dwords 1 and 3 (odd
    // pixels). pavgb then averages each even pixel with its odd neighbour,
    // byte by byte, rounding up.
1677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
1678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88     // pixels 0,2,4,6
1679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm1, 0xdd     // pixels 1,3,5,7
1680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4           // averaged pixels 0-3
1681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2
1682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm3, 0x88     // pixels 8,10,12,14
1683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm3, 0xdd     // pixels 9,11,13,15
1684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4           // averaged pixels 4-7
1685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // The averaged pixels are duplicated so that one copy is multiplied by
    // the U coefficients and the other by the V coefficients.
1689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
1690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes * signed coefficient bytes, adjacent
    // products summed into signed 16-bit words (two words per pixel).
1691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7  // U
1692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
1693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6  // V
1694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
    // phaddw: add each pixel's two words -> one 16-bit sum per pixel.
1695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm2            // 8 U sums
1696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm1, xmm3            // 8 V sums
1697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8               // arithmetic shift right by 8
1698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 8
    // Saturating pack: low 8 bytes of xmm0 = U, high 8 bytes = V.
1699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm1
1700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5            // -> unsigned
1701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 8 U and 8 V values
1703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]        // advances U (and, via edi, V)
17067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16               // sets the flags tested by jg
1707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts two adjacent rows of BGRA pixels to one row of U and V, subsampled
// 2:1 both vertically and horizontally. The second row is read at
// src_argb0 + src_stride_argb (a byte offset). Each pass of the loop reads
// 64 bytes (16 four-byte pixels) from each row, averages every 2x2 group down
// to one pixel, and writes 8 bytes to dst_u and 8 bytes to dst_v. The count is
// only tested after the body has run, so the loop always executes at least
// once and always handles whole groups of 16.
// Naked function: arguments are fetched by hand relative to esp.
17147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
17157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
17167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                       uint8* dst_u, uint8* dst_v, int width) {
1717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
    // esi and edi are saved here and restored before ret. The leading "8" in
    // the offsets below skips these two pushes; the second term is the
    // argument's position past the return address.
17187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       esi
1719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
17207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb0
17217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_stride_argb
17227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // dst_u
17237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]  // dst_v
17247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // Loop-invariant constants (defined elsewhere in the file).
1725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUV128      // bias added after packing
17267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kBGRAToV       // V coefficients
17277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kBGRAToU       // U coefficients
    // edi becomes (dst_v - dst_u) so that [edx + edi] addresses V while only
    // edx has to be advanced.
1728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Vertical pass: average each 16-byte chunk of the first row with the
    // chunk directly below it (pavgb rounds up). xmm4 is scratch.
1732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
17337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi]
17347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
1735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
17367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 16]
17377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm1, xmm4
1738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
17397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 32]
17407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
1741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
17427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 48]
17437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm3, xmm4
17447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]     // next 16 source pixels
    // Horizontal pass: shufps treats each 4-byte pixel as one dword; mask
    // 0x88 picks dwords 0 and 2 of each operand (even pixels), 0xdd picks
    // dwords 1 and 3 (odd pixels). pavgb averages each even/odd pair.
1746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
1747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88     // pixels 0,2,4,6
1748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm1, 0xdd     // pixels 1,3,5,7
1749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4           // averaged pixels 0-3
1750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2
1751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm3, 0x88     // pixels 8,10,12,14
1752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm3, 0xdd     // pixels 9,11,13,15
1753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4           // averaged pixels 4-7
1754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // The averaged pixels are duplicated so that one copy is multiplied by
    // the U coefficients and the other by the V coefficients.
1758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
1759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes * signed coefficient bytes, adjacent
    // products summed into signed 16-bit words (two words per pixel).
1760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7  // U
1761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
1762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6  // V
1763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
    // phaddw: add each pixel's two words -> one 16-bit sum per pixel.
1764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm2            // 8 U sums
1765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm1, xmm3            // 8 V sums
1766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8               // arithmetic shift right by 8
1767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 8
    // Saturating pack: low 8 bytes of xmm0 = U, high 8 bytes = V.
1768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm1
1769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5            // -> unsigned
1770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 8 U and 8 V values
1772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]        // advances U (and, via edi, V)
17757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16               // sets the flags tested by jg
1776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
17797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        esi
1780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts two adjacent rows of ABGR pixels to one row of U and V, subsampled
// 2:1 both vertically and horizontally. The second row is read at
// src_argb0 + src_stride_argb (a byte offset). Each pass of the loop reads
// 64 bytes (16 four-byte pixels) from each row, averages every 2x2 group down
// to one pixel, and writes 8 bytes to dst_u and 8 bytes to dst_v. The count is
// only tested after the body has run, so the loop always executes at least
// once and always handles whole groups of 16.
// Naked function: arguments are fetched by hand relative to esp.
17847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
17857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_u, uint8* dst_v, int width) {
1787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
    // esi and edi are saved here and restored before ret. The leading "8" in
    // the offsets below skips these two pushes; the second term is the
    // argument's position past the return address.
1788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb0
1791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_stride_argb
1792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // dst_u
1793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]  // dst_v
1794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // Loop-invariant constants (defined elsewhere in the file).
1795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUV128      // bias added after packing
17967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kABGRToV       // V coefficients
17977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kABGRToU       // U coefficients
    // edi becomes (dst_v - dst_u) so that [edx + edi] addresses V while only
    // edx has to be advanced.
1798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Vertical pass: average each 16-byte chunk of the first row with the
    // chunk directly below it (pavgb rounds up). xmm4 is scratch.
18027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
18037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi]
18047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
18057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
18067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 16]
18077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm1, xmm4
18087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
18097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 32]
18107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
18117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
18127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 48]
18137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm3, xmm4
18147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]     // next 16 source pixels
    // Horizontal pass: shufps treats each 4-byte pixel as one dword; mask
    // 0x88 picks dwords 0 and 2 of each operand (even pixels), 0xdd picks
    // dwords 1 and 3 (odd pixels). pavgb averages each even/odd pair.
1816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
1817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88     // pixels 0,2,4,6
1818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm1, 0xdd     // pixels 1,3,5,7
1819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4           // averaged pixels 0-3
1820ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2
1821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm3, 0x88     // pixels 8,10,12,14
1822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm3, 0xdd     // pixels 9,11,13,15
1823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4           // averaged pixels 4-7
1824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // The averaged pixels are duplicated so that one copy is multiplied by
    // the U coefficients and the other by the V coefficients.
1828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
1829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes * signed coefficient bytes, adjacent
    // products summed into signed 16-bit words (two words per pixel).
1830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7  // U
1831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
1832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6  // V
1833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
    // phaddw: add each pixel's two words -> one 16-bit sum per pixel.
1834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm2            // 8 U sums
1835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm1, xmm3            // 8 V sums
1836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8               // arithmetic shift right by 8
1837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 8
    // Saturating pack: low 8 bytes of xmm0 = U, high 8 bytes = V.
1838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm1
1839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5            // -> unsigned
1840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 8 U and 8 V values
1842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]        // advances U (and, via edi, V)
18457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16               // sets the flags tested by jg
1846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts two adjacent rows of RGBA pixels to one row of U and V, subsampled
// 2:1 both vertically and horizontally. The second row is read at
// src_argb0 + src_stride_argb (a byte offset). Each pass of the loop reads
// 64 bytes (16 four-byte pixels) from each row, averages every 2x2 group down
// to one pixel, and writes 8 bytes to dst_u and 8 bytes to dst_v. The count is
// only tested after the body has run, so the loop always executes at least
// once and always handles whole groups of 16.
// Naked function: arguments are fetched by hand relative to esp.
18547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
18557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
18567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                       uint8* dst_u, uint8* dst_v, int width) {
1857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
    // esi and edi are saved here and restored before ret. The leading "8" in
    // the offsets below skips these two pushes; the second term is the
    // argument's position past the return address.
1858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb0
1861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_stride_argb
1862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // dst_u
1863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]  // dst_v
1864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // Loop-invariant constants (defined elsewhere in the file).
1865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddUV128      // bias added after packing
18667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm6, kRGBAToV       // V coefficients
18677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm7, kRGBAToU       // U coefficients
    // edi becomes (dst_v - dst_u) so that [edx + edi] addresses V while only
    // edx has to be advanced.
1868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx             // stride from u to v
1869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
1871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Vertical pass: average each 16-byte chunk of the first row with the
    // chunk directly below it (pavgb rounds up). xmm4 is scratch.
1872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi]
1874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4
18757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 16]
1877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm4
18787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
1879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 32]
1880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
18817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
1882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm4, [eax + esi + 48]
1883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm3, xmm4
18847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]     // next 16 source pixels
    // Horizontal pass: shufps treats each 4-byte pixel as one dword; mask
    // 0x88 picks dwords 0 and 2 of each operand (even pixels), 0xdd picks
    // dwords 1 and 3 (odd pixels). pavgb averages each even/odd pair.
1886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
1887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88     // pixels 0,2,4,6
1888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm1, 0xdd     // pixels 1,3,5,7
1889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm4           // averaged pixels 0-3
1890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2
1891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm3, 0x88     // pixels 8,10,12,14
1892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm4, xmm3, 0xdd     // pixels 9,11,13,15
1893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4           // averaged pixels 4-7
1894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 2 - convert to U and V
1896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // from here down is very similar to Y code except
1897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // The averaged pixels are duplicated so that one copy is multiplied by
    // the U coefficients and the other by the V coefficients.
1898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
1899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes * signed coefficient bytes, adjacent
    // products summed into signed 16-bit words (two words per pixel).
1900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm7  // U
1901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm2, xmm7
1902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm6  // V
1903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm3, xmm6
    // phaddw: add each pixel's two words -> one 16-bit sum per pixel.
1904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm2            // 8 U sums
1905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm1, xmm3            // 8 V sums
1906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 8               // arithmetic shift right by 8
1907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 8
    // Saturating pack: low 8 bytes of xmm0 = U, high 8 bytes = V.
1908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packsswb   xmm0, xmm1
1909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddb      xmm0, xmm5            // -> unsigned
1910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // step 3 - store 8 U and 8 V values
1912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]        // advances U (and, via edi, V)
19157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16               // sets the flags tested by jg
1916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
1917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
19237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_ARGBTOYROW_SSSE3
1924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
19257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 16 UV from 444
// Loads 16 U bytes from [esi] and 16 V bytes from [esi + edi], advances esi
// by 16, and leaves 16 interleaved UV byte pairs in ymm0.
// vpermq 0xd8 swaps the two middle 64-bit quarters, which moves the second
// 8 loaded bytes into the upper 128-bit lane so that the per-lane vpunpcklbw
// interleaves all 16 values.
// Expects edi to hold (V pointer - U pointer).  Writes ymm0, ymm1 and esi.
19267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READYUV444_AVX2 __asm {                                                \
19277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    xmm0, [esi]                  /* U */         /* NOLINT */ \
19287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    xmm1, [esi + edi]            /* V */         /* NOLINT */ \
19297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 16]                                          \
19307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm0, ymm0, 0xd8                                          \
19317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm1, ymm1, 0xd8                                          \
19327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
19337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
19347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
19357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 8 UV from 422, upsample to 16 UV.
// Loads 8 U bytes from [esi] and 8 V bytes from [esi + edi], advances esi by
// 8, and interleaves them into 8 UV byte pairs in the low lane of ymm0.
// vpermq 0xd8 then places 4 pairs in each 128-bit lane, and unpacking ymm0
// with itself at word granularity duplicates every UV pair, giving 16 pairs.
// Expects edi to hold (V pointer - U pointer).  Writes ymm0, ymm1 and esi.
19367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READYUV422_AVX2 __asm {                                                \
19377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovq      xmm0, qword ptr [esi]        /* U */         /* NOLINT */ \
19387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */         /* NOLINT */ \
19397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 8]                                           \
19407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
19417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm0, ymm0, 0xd8                                          \
19427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
19437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
19447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
19457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 4 UV from 411, upsample to 16 UV.
// Loads 4 U bytes from [esi] and 4 V bytes from [esi + edi], advances esi by
// 4, and interleaves them into 4 UV byte pairs.  The word-granularity unpack
// of ymm0 with itself doubles each pair (8 pairs), vpermq 0xd8 places 4 of
// those in each 128-bit lane, and the dword-granularity unpack doubles them
// again, so every source UV pair ends up repeated 4 times (16 pairs total).
// Expects edi to hold (V pointer - U pointer).  Writes ymm0, ymm1 and esi.
19467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READYUV411_AVX2 __asm {                                                \
19477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovd      xmm0, dword ptr [esi]        /* U */         /* NOLINT */ \
19487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */         /* NOLINT */ \
19497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 4]                                           \
19507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
19517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
19527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm0, ymm0, 0xd8                                          \
19537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
19547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
19557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
19567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Read 8 UV from NV12, upsample to 16 UV.
// Loads 16 bytes (8 already-interleaved UV pairs) from [esi] and advances esi
// by 16.  vpermq 0xd8 places 4 pairs in each 128-bit lane, and unpacking ymm0
// with itself at word granularity duplicates every pair, giving 16 pairs.
// Unlike the planar readers, edi is not used.  Writes ymm0 and esi.
19577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define READNV12_AVX2 __asm {                                                  \
19587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
19597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 16]                                          \
19607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm0, ymm0, 0xd8                                          \
19617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
19627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
19637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
19647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Convert 16 pixels: 16 UV and 16 Y.
// Inputs:  ymm0 = 16 interleaved UV byte pairs (from one of the READ* macros),
//          eax  = pointer to 16 Y bytes (advanced by 16 here).
// Outputs: ymm0 = B, ymm1 = G, ymm2 = R, each saturated to unsigned bytes.
//          vpackuswb packs each register with itself, so the 8 results of a
//          128-bit lane appear in both halves of that lane.
// Scratch: ymm3.
// YuvConstants must provide the members kUVToR, kUVToG, kUVToB, kUVBiasR,
// kUVBiasG, kUVBiasB and kYToRgb used as memory operands below.
// Per channel the arithmetic is:
//   (bias - pmaddubsw(UV, coeff) + ((Y * 0x0101) * kYToRgb >> 16)) >> 6
// where Y * 0x0101 comes from unpacking the Y bytes with themselves and the
// additions saturate as signed 16-bit values.
19657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
19667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
19677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR        /* scale R UV */   \
19687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG        /* scale G UV */   \
19697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB        /* scale B UV */   \
19707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    ymm3, YuvConstants.kUVBiasR                               \
19717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpsubw     ymm2, ymm3, ymm2                                          \
19727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    ymm3, YuvConstants.kUVBiasG                               \
19737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpsubw     ymm1, ymm3, ymm1                                          \
19747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    ymm3, YuvConstants.kUVBiasB                               \
19757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpsubw     ymm0, ymm3, ymm0                                          \
19767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
19777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    xmm3, [eax]                  /* NOLINT */                 \
19787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        eax, [eax + 16]                                           \
19797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm3, ymm3, 0xd8                                          \
19807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
19817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpmulhuw   ymm3, ymm3, YuvConstants.kYToRgb                          \
19827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
19837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
19847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
19857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpsraw     ymm0, ymm0, 6                                             \
19867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpsraw     ymm1, ymm1, 6                                             \
19877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpsraw     ymm2, ymm2, 6                                             \
19887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
19897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
19907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
19917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
19927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
19937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 16 ARGB values.
// Inputs:  ymm0 = B, ymm1 = G, ymm2 = R (as produced by YUVTORGB_AVX2),
//          ymm5 = alpha bytes, edx = destination pointer.
// Interleaves the channels into B,G,R,A byte order, writes 64 bytes to [edx]
// with unaligned stores, and advances edx by 64.
// The vpermq 0xd8 steps reorder the 64-bit quarters so that the per-lane
// word unpacks emit pixels 0-7 into ymm1 and pixels 8-15 into ymm0.
// Overwrites ymm0, ymm1 and ymm2; ymm5 is only read.
19947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREARGB_AVX2 __asm {                                                 \
19957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into ARGB */                                              \
19967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
19977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm0, ymm0, 0xd8                                          \
19987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
19997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpermq     ymm2, ymm2, 0xd8                                          \
20007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
20017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
20027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    0[edx], ymm1                                              \
20037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm vmovdqu    32[edx], ymm0                                             \
20047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 64]                                          \
20057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
20067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
20077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TOARGBROW_AVX2
20087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels
20097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts one row from separate Y, U and V buffers to ARGB using
// kYuvConstants.
//   y_buf:    source Y bytes; 16 are read per loop iteration.
//   u_buf:    source U bytes; 8 are read per loop iteration.
//   v_buf:    source V bytes; 8 are read per loop iteration.
//   dst_argb: destination; 64 bytes are written per loop iteration.
//   width:    pixel count.  The count is tested after each group of 16, so at
//             least one group always runs and a width that is not a multiple
//             of 16 is effectively rounded up.
// Naked function: arguments are read straight from the stack; the leading
// "+ 8" in each offset skips the two registers pushed at entry.
20107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
20117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToARGBRow_AVX2(const uint8* y_buf,
20127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* u_buf,
20137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* v_buf,
20147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
20157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
2016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
20197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
20207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
20217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
20227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
20237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
20247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf, so [esi + edi] addresses V
20257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Read 8 U + 8 V, convert together with 16 Y, then store 16 ARGB pixels.
20287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV422_AVX2
20297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
20307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB_AVX2
2031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16              // 16 pixels consumed
2033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while pixels remain
2034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
20377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                      // zero upper ymm halves before returning
2038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
20417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I422TOARGBROW_AVX2
2042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
20437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_J422TOARGBROW_AVX2
20447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels
20457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Same instruction sequence as I422ToARGBRow_AVX2, but the conversion uses
// kYuvJConstants instead of kYuvConstants.
// NOTE(review): presumably the JPEG (full-range) coefficient set - confirm
// against the definition of kYuvJConstants.
//   y_buf:    source Y bytes; 16 are read per loop iteration.
//   u_buf:    source U bytes; 8 are read per loop iteration.
//   v_buf:    source V bytes; 8 are read per loop iteration.
//   dst_argb: destination; 64 bytes are written per loop iteration.
//   width:    pixel count.  The count is tested after each group of 16, so at
//             least one group always runs and a width that is not a multiple
//             of 16 is effectively rounded up.
// Naked function: arguments are read straight from the stack; the leading
// "+ 8" in each offset skips the two registers pushed at entry.
20467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
20477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J422ToARGBRow_AVX2(const uint8* y_buf,
20487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* u_buf,
20497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* v_buf,
20507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
20517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
2052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
20557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
20567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
20577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
20587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
20597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
20607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf, so [esi + edi] addresses V
20617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Read 8 U + 8 V, convert together with 16 Y, then store 16 ARGB pixels.
20647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV422_AVX2
20657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvJConstants)
20667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB_AVX2
2067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
20687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16              // 16 pixels consumed
20697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop          // repeat while pixels remain
20707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
20717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        edi
20727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        esi
20737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                      // zero upper ymm halves before returning
20747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
20757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
20767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
20777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_J422TOARGBROW_AVX2
20787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
20797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I444TOARGBROW_AVX2
20807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels
20817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// Converts one row from separate Y, U and V buffers to ARGB using
// kYuvConstants, with one U and one V sample per pixel (no upsampling).
//   y_buf:    source Y bytes; 16 are read per loop iteration.
//   u_buf:    source U bytes; 16 are read per loop iteration.
//   v_buf:    source V bytes; 16 are read per loop iteration.
//   dst_argb: destination; 64 bytes are written per loop iteration.
//   width:    pixel count.  The count is tested after each group of 16, so at
//             least one group always runs and a width that is not a multiple
//             of 16 is effectively rounded up.
// Naked function: arguments are read straight from the stack; the leading
// "+ 8" in each offset skips the two registers pushed at entry.
20827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
20837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I444ToARGBRow_AVX2(const uint8* y_buf,
20847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* u_buf,
20857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* v_buf,
20867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
20877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
20887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
20897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       esi
20907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       edi
20917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
20927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
20937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
20947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
20957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
20967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf, so [esi + edi] addresses V
20977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
20987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
20997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
    // Read 16 U + 16 V, convert together with 16 Y, then store 16 ARGB pixels.
21007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV444_AVX2
21017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
21027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB_AVX2
2103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16              // 16 pixels consumed
2105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while pixels remain
2106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
21097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                      // zero upper ymm halves before returning
2110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
21137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I444TOARGBROW_AVX2
2114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
21157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I411TOARGBROW_AVX2
21167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels
21177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts one row from separate Y, U and V buffers to ARGB using
// kYuvConstants, with each U/V sample shared by 4 horizontal pixels.
//   y_buf:    source Y bytes; 16 are read per loop iteration.
//   u_buf:    source U bytes; 4 are read per loop iteration.
//   v_buf:    source V bytes; 4 are read per loop iteration.
//   dst_argb: destination; 64 bytes are written per loop iteration.
//   width:    pixel count.  The count is tested after each group of 16, so at
//             least one group always runs and a width that is not a multiple
//             of 16 is effectively rounded up.
// Naked function: arguments are read straight from the stack; the leading
// "+ 8" in each offset skips the two registers pushed at entry.
21187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
21197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I411ToARGBRow_AVX2(const uint8* y_buf,
21207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* u_buf,
21217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* v_buf,
21227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
21237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
2124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
21277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
21287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
21297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
21307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
21317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
21327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf, so [esi + edi] addresses V
21337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Read 4 U + 4 V, convert together with 16 Y, then store 16 ARGB pixels.
21367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV411_AVX2
21377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
21387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB_AVX2
2139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
21407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16              // 16 pixels consumed
21417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop          // repeat while pixels remain
21427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
21437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        edi
21447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        esi
21457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                      // zero upper ymm halves before returning
21467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
21477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
21487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
21497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I411TOARGBROW_AVX2
21507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
21517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_NV12TOARGBROW_AVX2
21527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels.
21537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts one row from a Y buffer plus an interleaved UV buffer to ARGB
// using kYuvConstants.
//   y_buf:    source Y bytes; 16 are read per loop iteration.
//   uv_buf:   interleaved chroma bytes; 16 (8 pairs) are read per iteration.
//   dst_argb: destination; 64 bytes are written per loop iteration.
//   width:    pixel count.  The count is tested after each group of 16, so at
//             least one group always runs and a width that is not a multiple
//             of 16 is effectively rounded up.
// Naked function: arguments are read straight from the stack; the leading
// "+ 4" in each offset skips the one register pushed at entry.
21547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
21557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV12ToARGBRow_AVX2(const uint8* y_buf,
21567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* uv_buf,
21577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
21587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
21597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
21607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       esi
21617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // Y
21627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // UV
21637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // argb
21647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
21657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
21667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
21677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
    // Read 8 UV pairs, convert together with 16 Y, then store 16 ARGB pixels.
21687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READNV12_AVX2
21697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
21707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB_AVX2
2171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16              // 16 pixels consumed
21737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop          // repeat while pixels remain
21747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
21757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        esi
21767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                      // zero upper ymm halves before returning
21777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
21787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
21797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
21807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_NV12TOARGBROW_AVX2
21817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
21827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_NV21TOARGBROW_AVX2
21837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels.
21847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
// Same instruction sequence as NV12ToARGBRow_AVX2 (it reuses READNV12_AVX2),
// but the conversion uses kYvuConstants instead of kYuvConstants.
// NOTE(review): presumably kYvuConstants holds the coefficients with the U
// and V roles swapped to match the VU byte order - confirm at its definition.
//   y_buf:    source Y bytes; 16 are read per loop iteration.
//   uv_buf:   interleaved chroma bytes; 16 (8 pairs) are read per iteration.
//   dst_argb: destination; 64 bytes are written per loop iteration.
//   width:    pixel count.  The count is tested after each group of 16, so at
//             least one group always runs and a width that is not a multiple
//             of 16 is effectively rounded up.
// Naked function: arguments are read straight from the stack; the leading
// "+ 4" in each offset skips the one register pushed at entry.
21857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
21867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV21ToARGBRow_AVX2(const uint8* y_buf,
21877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* uv_buf,
21887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
21897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
21907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
21917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       esi
21927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // Y
21937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // VU (passed in uv_buf)
21947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // argb
21957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
21967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
21977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
21987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
    // Read 8 VU pairs, convert together with 16 Y, then store 16 ARGB pixels.
21997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READNV12_AVX2
22007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYvuConstants)
22017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB_AVX2
22027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
22037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16              // 16 pixels consumed
22047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop          // repeat while pixels remain
22057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
22067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        esi
22077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                      // zero upper ymm halves before returning
22087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
22097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
22107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
22117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_NV21TOARGBROW_AVX2
22127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
22137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TOBGRAROW_AVX2
22147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels
22157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
22167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
// Converts one row from separate Y, U and V planes to BGRA.
//   y_buf, u_buf, v_buf - source plane pointers for this row.
//   dst_argb            - destination; receives BGRA despite the parameter name.
//   width               - pixel count.
// Naked function: there is no compiler-generated prologue/epilogue, so the
// arguments are fetched from the stack by hand and esi/edi are saved and
// restored here.
// The loop body runs before width is tested and consumes 16 pixels per pass
// (sub ecx, 16 / jg), so the last pass always stores a full 64 bytes even when
// fewer than 16 pixels remain.
22177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
22187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToBGRARow_AVX2(const uint8* y_buf,
22197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* u_buf,
22207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* v_buf,
22217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
22227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
22237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
22247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       esi
22257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push       edi
    // Stack offsets: 8 bytes for the two pushes above + 4 for the return address.
22267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
22277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
22287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
22297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
22307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // edi = V - U.  READYUV422_AVX2 is defined outside this section; presumably
    // it addresses V as [esi + edi], as the SSSE3 READYUV422 macro in this file
    // does, so only esi needs to advance.
22317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi
22327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
22337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
22347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
22357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV422_AVX2
22367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
22377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    // Per the comments below, ymm0 = B, ymm1 = G, ymm2 = R at this point
    // (YUVTORGB_AVX2 is not visible in this section - confirm against it).
    // The byte/word unpacks work within each 128-bit lane; vpermq 0xd8 swaps
    // the two middle quadwords so the word unpacks emit pixels in order.
    // Resulting memory byte order is A,R,G,B, which this file names BGRA.
22387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // Step 3: Weave into BGRA
22397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm1, ymm1, ymm0           // GB
22407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
22417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm2, ymm5, ymm2           // AR
22427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm2, ymm2, 0xd8
22437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd ymm0, ymm2, ymm1           // ARGB first 8 pixels
22447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhwd ymm2, ymm2, ymm1           // ARGB next 8 pixels
    // Unaligned 32-byte stores; dst_argb needs no particular alignment.
22457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
22467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm2
22477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx,  [edx + 64]
22487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
2249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
2250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Zero the upper halves of the ymm registers before returning.
22537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
2254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
22577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I422TOBGRAROW_AVX2
2258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
22597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TORGBAROW_AVX2
22607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels
22617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
22627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
// Converts one row from separate Y, U and V planes to RGBA.
//   y_buf, u_buf, v_buf - source plane pointers for this row.
//   dst_argb            - destination; receives RGBA despite the parameter name.
//   width               - pixel count.
// Naked function: arguments are read from the stack by hand, and esi/edi are
// saved and restored explicitly.
// The loop body runs before width is tested and handles 16 pixels per pass, so
// the last pass always stores a full 64 bytes even for a partial tail.
22637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
22647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToRGBARow_AVX2(const uint8* y_buf,
22657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* u_buf,
22667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        const uint8* v_buf,
22677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_argb,
22687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
2269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // Stack offsets: 8 bytes for the two pushes above + 4 for the return address.
22727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
22737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
22747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
22757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
22767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // edi = V - U.  READYUV422_AVX2 is defined outside this section; presumably
    // it addresses V as [esi + edi], as the SSSE3 READYUV422 macro in this file
    // does.
22777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi
22787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
22817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV422_AVX2
22827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
2283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Per the comments below, ymm0 = B, ymm1 = G, ymm2 = R at this point
    // (YUVTORGB_AVX2 is not visible in this section - confirm against it).
    // vpermq 0xd8 swaps the two middle quadwords so that the in-lane word
    // unpacks emit pixels in order.
    // Resulting memory byte order is A,B,G,R, which this file names RGBA.
22847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // Step 3: Weave into RGBA
22857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm1, ymm1, ymm2           // GR
22867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
22877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm2, ymm5, ymm0           // AB
22887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm2, ymm2, 0xd8
22897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd ymm0, ymm2, ymm1           // ABGR first 8 pixels
22907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhwd ymm1, ymm2, ymm1           // ABGR next 8 pixels
    // Unaligned 32-byte stores; dst_argb needs no particular alignment.
22917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
22927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm1
22937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx,  [edx + 64]
2294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
2295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
2296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Zero the upper halves of the ymm registers before returning.
22997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
2300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
23037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I422TORGBAROW_AVX2
2304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
23057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I422TOABGRROW_AVX2
2306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 16 pixels
23077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
23087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Use macros to reduce duplicate code.  See SSSE3.
// Converts one row from separate Y, U and V planes to ABGR.
//   y_buf, u_buf, v_buf - source plane pointers for this row.
//   dst_argb            - destination; receives ABGR despite the parameter name.
//   width               - pixel count.
// Naked function: arguments are read from the stack by hand, and esi/edi are
// saved and restored explicitly.
// The loop body runs before width is tested and handles 16 pixels per pass, so
// the last pass always stores a full 64 bytes even for a partial tail.
23097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
23107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I422ToABGRRow_AVX2(const uint8* y_buf,
2311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* u_buf,
2312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* v_buf,
2313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb,
2314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
2315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // Stack offsets: 8 bytes for the two pushes above + 4 for the return address.
2318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
2322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // edi = V - U.  READYUV422_AVX2 is defined outside this section; presumably
    // it addresses V as [esi + edi], as the SSSE3 READYUV422 macro in this file
    // does.
2323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi
2324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
2325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
23277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    READYUV422_AVX2
23287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB_AVX2(kYuvConstants)
23297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    // Per the comments below, ymm0 = B, ymm1 = G, ymm2 = R at this point
    // (YUVTORGB_AVX2 is not visible in this section - confirm against it).
    // vpermq 0xd8 swaps the two middle quadwords so that the in-lane word
    // unpacks emit pixels in order.
    // Resulting memory byte order is R,G,B,A, which this file names ABGR.
23307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // Step 3: Weave into ABGR
23317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm1, ymm2, ymm1           // RG
23327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
23337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm2, ymm0, ymm5           // BA
2334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm2, ymm2, 0xd8
23357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd ymm0, ymm1, ymm2           // RGBA first 8 pixels
23367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhwd ymm1, ymm1, ymm2           // RGBA next 8 pixels
    // Unaligned 32-byte stores; dst_argb needs no particular alignment.
23377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
23387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm1
2339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx,  [edx + 64]
2340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
2341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
2342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Zero the upper halves of the ymm registers before returning.
23457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
2346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
23497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I422TOABGRROW_AVX2
2350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
23517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if defined(HAS_I422TOARGBROW_SSSE3)
2352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Read 8 UV from 444.
// In:  esi = U pointer; edi = offset such that [esi + edi] addresses V.
// Out: xmm0 = 8 interleaved U,V byte pairs; esi advanced by 8.
// Clobbers xmm1.
2355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define READYUV444 __asm {                                                     \
2356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
2357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
2358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 8]                                           \
2359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Read 4 UV from 422, upsample to 8 UV.
// In:  esi = U pointer; edi = offset such that [esi + edi] addresses V.
// Out: xmm0 = 8 U,V byte pairs, each of the 4 source pairs repeated twice
//      (punpcklwd of the register with itself); esi advanced by 4.
// Clobbers xmm1.
2363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define READYUV422 __asm {                                                     \
2364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movd       xmm0, [esi]          /* U */                              \
2365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movd       xmm1, [esi + edi]    /* V */                              \
2366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 4]                                           \
2367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Read 2 UV from 411, upsample to 8 UV.
// In:  esi = U pointer; edi = offset such that [esi + edi] addresses V.
// Out: xmm0 = 8 U,V byte pairs, each of the 2 source pairs repeated four times
//      (word then dword self-unpack); esi advanced by 2.
// Clobbers ebx and xmm1.  The macro does not save ebx itself, so the
// enclosing function has to preserve it.
2372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define READYUV411 __asm {                                                     \
2373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
2374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movd       xmm0, ebx                                                 \
2375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
2376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movd       xmm1, ebx                                                 \
2377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 2]                                           \
2378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
23807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckldq  xmm0, xmm0           /* UVUVUVUV (upsample) */            \
2381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Read 4 UV from NV12, upsample to 8 UV.
// In:  esi = pointer to interleaved chroma bytes (edi is not used).
// Out: xmm0 = 8 byte pairs, each of the 4 source pairs repeated twice;
//      esi advanced by 8.
2384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#define READNV12 __asm {                                                       \
2385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
2386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm lea        esi,  [esi + 8]                                           \
2387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved U,V byte pairs (as left by the READ* macros);
//      eax  = pointer to the next 8 Y bytes.
// Out: xmm0 = B, xmm1 = G, xmm2 = R as unsigned bytes.  packuswb packs each
//      register with itself, so the 8 results sit in the low 8 bytes.
// Side effects: eax advanced by 8; xmm3 clobbered.
// Per channel: (kUVBias - pmaddubsw(UV, kUVTo) + Y') >> 6, saturated to 8 bits,
// where Y' = pmulhuw(byte-duplicated Y, kYToRgb).
// YuvConstants must name an object with members kUVBiasB/G/R, kUVToB/G/R and
// kYToRgb; each is used directly as a 128-bit memory operand.
23917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define YUVTORGB(YuvConstants) __asm {                                         \
2392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
2393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movdqa     xmm1, xmm0                                                \
2394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movdqa     xmm2, xmm0                                                \
23957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm3, xmm0                                                \
23967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm0, YuvConstants.kUVBiasB /* unbias back to signed */   \
23977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pmaddubsw  xmm1, YuvConstants.kUVToB   /* scale B UV */              \
23987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psubw      xmm0, xmm1                                                \
23997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm1, YuvConstants.kUVBiasG                               \
24007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pmaddubsw  xmm2, YuvConstants.kUVToG   /* scale G UV */              \
24017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psubw      xmm1, xmm2                                                \
24027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm2, YuvConstants.kUVBiasR                               \
24037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pmaddubsw  xmm3, YuvConstants.kUVToR   /* scale R UV */              \
24047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psubw      xmm2, xmm3                                                \
2405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
2406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
2407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm lea        eax, [eax + 8]                                            \
24087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm3, xmm3                                                \
24097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pmulhuw    xmm3, YuvConstants.kYToRgb                                \
2410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
2411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
2412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
2413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm psraw      xmm0, 6                                                   \
2414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm psraw      xmm1, 6                                                   \
2415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm psraw      xmm2, 6                                                   \
2416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm packuswb   xmm0, xmm0           /* B */                              \
2417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm packuswb   xmm1, xmm1           /* G */                              \
2418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm packuswb   xmm2, xmm2           /* R */                              \
2419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
24217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 ARGB values.
// In:  xmm0 = B, xmm1 = G, xmm2 = R (low 8 bytes each); xmm5 = alpha bytes,
//      which this macro does not set - the enclosing function must load it;
//      edx = destination pointer.
// Out: 32 bytes written with unaligned stores, in memory byte order B,G,R,A;
//      edx advanced by 32.
// Clobbers xmm0, xmm1 and xmm2; xmm5 is left unchanged.
24227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREARGB __asm {                                                      \
24237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into ARGB */                                              \
24247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
24257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
2426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    __asm movdqa     xmm1, xmm0                                                \
24277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
24287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
24297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     0[edx], xmm0                                              \
24307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     16[edx], xmm1                                             \
24317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 32]                                          \
24327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
24337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
24347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 BGRA values.
// In:  xmm0 = B, xmm1 = G, xmm2 = R (low 8 bytes each); edx = destination.
// Out: 32 bytes written with unaligned stores, in memory byte order A,R,G,B;
//      edx advanced by 32.
// xmm5 is regenerated as all-ones (alpha) on every expansion because the weave
// overwrites it.  Clobbers xmm0, xmm1 and xmm5.
24357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREBGRA __asm {                                                      \
24367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into BGRA */                                              \
24377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
24387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
24397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
24407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm0, xmm5                                                \
24417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
24427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
24437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     0[edx], xmm5                                              \
24447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     16[edx], xmm0                                             \
24457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 32]                                          \
24467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
24477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
24487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 ABGR values.
// In:  xmm0 = B, xmm1 = G, xmm2 = R (low 8 bytes each); xmm5 = alpha bytes,
//      which this macro does not set - the enclosing function must load it;
//      edx = destination pointer.
// Out: 32 bytes written with unaligned stores, in memory byte order R,G,B,A
//      (the "RGBA" in the line comments below is that byte order);
//      edx advanced by 32.
// Clobbers xmm0, xmm1 and xmm2; xmm5 is left unchanged.
24497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STOREABGR __asm {                                                      \
24507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into ABGR */                                              \
24517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm2, xmm1           /* RG */                             \
24527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm5           /* BA */                             \
24537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm1, xmm2                                                \
24547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
24557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
24567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     0[edx], xmm2                                              \
24577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     16[edx], xmm1                                             \
24587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 32]                                          \
24597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
24607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
24617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RGBA values.
// In:  xmm0 = B, xmm1 = G, xmm2 = R (low 8 bytes each); edx = destination.
// Out: 32 bytes written with unaligned stores, in memory byte order A,B,G,R
//      (the "RGBA" in the line comments below is the format name, not the
//      byte order); edx advanced by 32.
// xmm5 is regenerated as all-ones (alpha) on every expansion because the weave
// overwrites it.  Clobbers xmm0, xmm1 and xmm5.
24627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERGBA __asm {                                                      \
24637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into RGBA */                                              \
24647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
24657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
24667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
24677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm0, xmm5                                                \
24687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
24697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
24707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     0[edx], xmm5                                              \
24717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     16[edx], xmm0                                             \
24727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 32]                                          \
24737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
24747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
24757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RGB24 values.
// In:  xmm0 = B, xmm1 = G, xmm2 = R (low 8 bytes each); edx = destination.
//      xmm5 and xmm6 are used as pshufb shuffle masks here (not alpha); this
//      macro does not load them, so the enclosing function must.
// Out: 24 bytes written (8 via movq, then 16 unaligned at edx + 8);
//      edx advanced by 24.
// Pixels are first woven as 4-byte B,G,R,R groups, then the shuffles drop the
// duplicate R byte to pack them to 3 bytes each.
// Clobbers xmm0, xmm1 and xmm2.
24767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERGB24 __asm {                                                     \
24777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into RRGB */                                              \
24787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
24797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
24807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm1, xmm0                                                \
24817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
24827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
24837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 4: RRGB -> RGB24 */                                                \
24847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
24857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
24867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
24877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
24887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
24897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 24]                                          \
24907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
24917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
24927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RAW values.
// The instruction sequence is the same as STORERGB24; the output byte order is
// determined only by the pshufb masks the caller has loaded into xmm5 and xmm6
// (I422ToRAWRow_SSSE3 loads kShuffleMaskARGBToRAW_0 / kShuffleMaskARGBToRAW).
// Per the inline comments, expects xmm0 = B, xmm1 = G, xmm2 = R.
// Writes 24 bytes at [edx] and advances edx by 24. Modifies xmm0, xmm1, xmm2.
24937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERAW __asm {                                                       \
24947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into RRGB */                                              \
24957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
24967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
24977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm1, xmm0                                                \
24987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
24997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
25007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 4: RRGB -> RAW */                                                  \
25017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
25027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
25037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
25047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
25057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
25067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx,  [edx + 24]                                          \
25077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
25087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
25097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Store 8 RGB565 values.
// Per the inline comments, expects xmm0 = B, xmm1 = G, xmm2 = R. The caller
// must preload the per-dword field masks: xmm5 for B, xmm6 for G, xmm7 for R
// (I422ToRGB565Row_SSSE3 builds 0x0000001f, 0x000007e0 and 0xfffff800).
// For each 32-bit pixel: B is shifted right by 3, G right by 5, and R is
// produced by a left shift of 8 followed by an arithmetic right shift of 16.
// The arithmetic shift leaves each dword as a sign-extended 16-bit value, so
// the signed-saturating packssdw keeps the low 16 bits unchanged.
// Writes 16 bytes at [edx] and advances edx by 16. Modifies xmm0-xmm3.
25107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#define STORERGB565 __asm {                                                    \
25117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 3: Weave into RRGB */                                              \
25127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
25137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
25147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm1, xmm0                                                \
25157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
25167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
25177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    /* Step 4: RRGB -> RGB565 */                                               \
25187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
25197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm2, xmm0    /* G */                                     \
25207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pslld      xmm0, 8       /* R */                                     \
25217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psrld      xmm3, 3       /* B */                                     \
25227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psrld      xmm2, 5       /* G */                                     \
25237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psrad      xmm0, 16      /* R */                                     \
25247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pand       xmm3, xmm5    /* B */                                     \
25257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pand       xmm2, xmm6    /* G */                                     \
25267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pand       xmm0, xmm7    /* R */                                     \
25277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm por        xmm3, xmm2    /* BG */                                    \
25287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm por        xmm0, xmm3    /* BGR */                                   \
25297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
25307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqa     xmm2, xmm1    /* G */                                     \
25317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pslld      xmm1, 8       /* R */                                     \
25327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psrld      xmm3, 3       /* B */                                     \
25337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psrld      xmm2, 5       /* G */                                     \
25347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm psrad      xmm1, 16      /* R */                                     \
25357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pand       xmm3, xmm5    /* B */                                     \
25367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pand       xmm2, xmm6    /* G */                                     \
25377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm pand       xmm1, xmm7    /* R */                                     \
25387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm por        xmm3, xmm2    /* BG */                                    \
25397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm por        xmm1, xmm3    /* BGR */                                   \
25407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm packssdw   xmm0, xmm1                                                \
25417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
25427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __asm lea        edx, [edx + 16]                                           \
25437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
25447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
25457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
2546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// Naked function: there is no compiler-generated prologue/epilogue, so the
// arguments are read directly from the stack. In [esp + 8 + N] the 8 accounts
// for the two registers pushed below and N = 4 skips the return address.
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf, edx = dst_argb,
// ecx = remaining width. esi and edi are saved and restored here.
// The loop body runs before the width is tested and counts down by 8, so it
// executes at least once and always works in whole 8-pixel groups.
// NOTE(review): callers presumably handle widths that are not a multiple of 8
// - confirm against the dispatch code.
25477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I444ToARGBRow_SSSE3(const uint8* y_buf,
2549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* u_buf,
2550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* v_buf,
2551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_argb,
2552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
2553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
2560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
2562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV444
25667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
25677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB
2568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels handled per iteration
2570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while remaining width > 0
2571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
25787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
25797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
// Naked function: arguments are read directly from the stack; in
// [esp + 8 + N] the 8 accounts for the two pushed registers and N = 4 skips the
// return address.
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf, edx = dst_rgb24,
// ecx = remaining width. xmm5 and xmm6 hold the shuffle masks that STORERGB24
// applies with pshufb; they are loaded once, outside the loop.
// The loop body runs at least once and counts down by 8; each STORERGB24
// writes 24 bytes and advances edx by 24.
25807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB24Row_SSSE3(const uint8* y_buf,
2582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* u_buf,
2583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* v_buf,
2584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_rgb24,
2585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int width) {
2586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // rgb24
2593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
2595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kShuffleMaskARGBToRGB24_0  // mask applied to the first 4 pixels
2596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, kShuffleMaskARGBToRGB24    // mask applied to the next 4 pixels
2597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
26007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
26017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STORERGB24
2602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels handled per iteration
2604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while remaining width > 0
2605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
26127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
26137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
// Same structure as I422ToRGB24Row_SSSE3; it differs only in the shuffle masks
// loaded into xmm5/xmm6 and in using STORERAW for the store step.
// Naked function: arguments are read directly from the stack; in
// [esp + 8 + N] the 8 accounts for the two pushed registers and N = 4 skips the
// return address.
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf, edx = dst_raw,
// ecx = remaining width. The loop body runs at least once and counts down by 8;
// each STORERAW writes 24 bytes and advances edx by 24.
26147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRAWRow_SSSE3(const uint8* y_buf,
2616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* u_buf,
2617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* v_buf,
2618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_raw,
2619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int width) {
2620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // raw
2627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
2629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kShuffleMaskARGBToRAW_0  // mask applied to the first 4 pixels
2630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, kShuffleMaskARGBToRAW    // mask applied to the next 4 pixels
2631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
26347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
26357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STORERAW
2636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels handled per iteration
2638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while remaining width > 0
2639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
26467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels
26477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
// Naked function: arguments are read directly from the stack; in
// [esp + 8 + N] the 8 accounts for the two pushed registers and N = 4 skips the
// return address.
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf,
// edx = rgb565_buf, ecx = remaining width.
// xmm5/xmm6/xmm7 are the per-dword B/G/R field masks consumed by STORERGB565;
// they are built from all-ones with shifts rather than loaded from memory.
// The R mask keeps the upper 16 bits set (0xfffff800) so the sign-extended
// value produced by psrad in STORERGB565 survives the pand and packs without
// saturation.
// The loop body runs at least once and counts down by 8; each STORERGB565
// writes 16 bytes and advances edx by 16.
26487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGB565Row_SSSE3(const uint8* y_buf,
2650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           const uint8* u_buf,
2651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           const uint8* v_buf,
2652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           uint8* rgb565_buf,
2653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           int width) {
2654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // rgb565
2661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
2663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
2664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm5, 27
2665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
2666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm6, 26
2667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm6, 5
2668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
2669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm7, 11
2670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
26737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
26747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STORERGB565
2675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels handled per iteration
2677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while remaining width > 0
2678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
26857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
2686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Naked function: arguments are read directly from the stack; in
// [esp + 8 + N] the 8 accounts for the two pushed registers and N = 4 skips the
// return address.
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf, edx = dst_argb,
// ecx = remaining width, xmm5 = all-ones used as the alpha source.
// The loop body runs before the width is tested and counts down by 8, so it
// executes at least once and always works in whole 8-pixel groups.
26877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToARGBRow_SSSE3(const uint8* y_buf,
2689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* u_buf,
2690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* v_buf,
2691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_argb,
2692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
2693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
2700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
2702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
27067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
27077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB
2708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels handled per iteration
2710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while remaining width > 0
2711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
27187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
27197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// JPeg color space version of I422ToARGB
27207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// The only difference from I422ToARGBRow_SSSE3 is the constant table passed to
// YUVTORGB (kYuvJConstants instead of kYuvConstants).
// Naked function: arguments are read directly from the stack; in
// [esp + 8 + N] the 8 accounts for the two pushed registers and N = 4 skips the
// return address.
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf, edx = dst_argb,
// ecx = remaining width, xmm5 = all-ones used as the alpha source.
// The loop body runs at least once and counts down by 8.
27217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
27227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid J422ToARGBRow_SSSE3(const uint8* y_buf,
2723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* u_buf,
2724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* v_buf,
2725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_argb,
2726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
2727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // argb
2734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = v_buf - u_buf (V kept as an offset from U)
2736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
27407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvJConstants)
27417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB
27427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels handled per iteration
2744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // repeat while remaining width > 0
2745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
27527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
2753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Similar to I420 but duplicate UV once more.
// Naked function: arguments are read directly from the stack. Three registers
// are pushed here (ebx as well as esi/edi, because READYUV411 modifies EBX),
// so the argument offsets are [esp + 12 + N] rather than [esp + 8 + N].
// Register use: eax = y_buf, esi = u_buf, edi = v_buf - u_buf, edx = dst_argb,
// ecx = remaining width, xmm5 = all-ones used as the alpha source.
// The loop body runs at least once and counts down by 8.
27557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
27567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I411ToARGBRow_SSSE3(const uint8* y_buf,
27577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         const uint8* u_buf,
27587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         const uint8* v_buf,
27597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         uint8* dst_argb,
27607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         int width) {
2761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       ebx
2763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
2765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 12 + 4]   // Y
2766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 12 + 8]   // U
2767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 12 + 12]  // V
2768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12 + 16]  // argb
2769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12 + 20]  // width
2770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi              // edi = v_buf - u_buf (V kept as an offset from U)
27717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
2772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV411  // modifies EBX
27757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
27767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB
2777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8                // 8 pixels handled per iteration
2779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop           // repeat while remaining width > 0
2780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
2782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        ebx
2784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
27887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
2789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
//
// Converts one row of NV12 (a Y row plus an interleaved UV row) to ARGB.
//   y_buf    - source Y row.
//   uv_buf   - source interleaved UV row.
//   dst_argb - destination ARGB row.
//   width    - number of pixels. The loop body runs before the count is
//              tested and retires 8 pixels per pass, so callers presumably
//              pass a positive multiple of 8 - TODO confirm at call sites.
// Naked function: there is no compiler-generated prologue/epilogue, so the
// arguments are fetched from the stack by hand and esi is saved/restored
// manually. READNV12, YUVTORGB and STOREARGB are macros defined elsewhere in
// this file; the notes on them below are assumptions to be confirmed there.
27907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
27917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV12ToARGBRow_SSSE3(const uint8* y_buf,
27927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         const uint8* uv_buf,
27937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         uint8* dst_argb,
27947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         int width) {
2795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi                  // saved here, restored before ret
    // Offsets are [esp + 4 (pushed esi) + N]; N = 4 is the first argument.
2797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // Y
2798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // UV
2799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // argb
2800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
2801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV12                        // presumably loads 8 Y + 4 UV pairs and advances eax/esi
28057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
28067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB                       // presumably stores 32 bytes at [edx] and advances edx
2807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels retired this pass
2809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // loop while the remaining count is > 0
2810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
28167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels.
28177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
//
// Converts one row of NV21 (a Y row plus an interleaved VU row) to ARGB.
//   y_buf    - source Y row.
//   uv_buf   - source interleaved chroma row (VU order per the line above).
//   dst_argb - destination ARGB row.
//   width    - number of pixels. The loop body runs before the count is
//              tested and retires 8 pixels per pass, so callers presumably
//              pass a positive multiple of 8 - TODO confirm at call sites.
// The instruction sequence matches NV12ToARGBRow_SSSE3, including the
// READNV12 load macro; the only difference is that YUVTORGB is given
// kYvuConstants instead of kYuvConstants. NOTE(review): presumably that table
// carries the U/V coefficients in swapped positions - confirm at its
// definition.
28187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
28197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid NV21ToARGBRow_SSSE3(const uint8* y_buf,
28207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         const uint8* uv_buf,
28217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         uint8* dst_argb,
28227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         int width) {
2823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi                  // saved here, restored before ret
    // Offsets are [esp + 4 (pushed esi) + N]; N = 4 is the first argument.
2825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // Y
28267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // VU (the uv_buf argument)
2827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // argb
2828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
2829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READNV12                        // same load macro as the NV12 path
28337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYvuConstants)         // Yvu (not Yuv) constants: the one difference from NV12
28347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREARGB
2835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels retired this pass
2837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // loop while the remaining count is > 0
2838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of I422 (separate Y, U and V rows) to BGRA.
//   y_buf / u_buf / v_buf - source rows.
//   dst_bgra              - destination row.
//   width                 - number of pixels. The loop body runs before the
//                           count is tested and retires 8 pixels per pass, so
//                           callers presumably pass a positive multiple of 8 -
//                           TODO confirm at call sites.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved/restored manually. READYUV422, YUVTORGB and STOREBGRA are macros
// defined elsewhere in this file.
// NOTE(review): unlike the ARGB/ABGR variants, xmm5 is not preloaded with
// all-ones here; presumably STOREBGRA builds its own alpha - confirm in the
// macro.
28447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToBGRARow_SSSE3(const uint8* y_buf,
2846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* u_buf,
2847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* v_buf,
2848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_bgra,
2849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
2850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // Offsets are [esp + 8 (two pushes) + N]; N = 4 is the first argument.
2853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // bgra
2857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = V - U; presumably READYUV422 reads V at [esi + edi]
2859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
28627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
28637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREBGRA
2864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels retired this pass
2866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // loop while the remaining count is > 0
2867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi                  // restore in reverse push order
2869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of I422 (separate Y, U and V rows) to ABGR.
//   y_buf / u_buf / v_buf - source rows.
//   dst_abgr              - destination row.
//   width                 - number of pixels. The loop body runs before the
//                           count is tested and retires 8 pixels per pass, so
//                           callers presumably pass a positive multiple of 8 -
//                           TODO confirm at call sites.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved/restored manually. READYUV422, YUVTORGB and STOREABGR are macros
// defined elsewhere in this file.
28747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToABGRRow_SSSE3(const uint8* y_buf,
2876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* u_buf,
2877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* v_buf,
2878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_abgr,
2879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
2880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // Offsets are [esp + 8 (two pushes) + N]; N = 4 is the first argument.
2883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // abgr
2887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = V - U; presumably READYUV422 reads V at [esi + edi]
2889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
28937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
28947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STOREABGR
2895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels retired this pass
2897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // loop while the remaining count is > 0
2898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi                  // restore in reverse push order
2900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Converts one row of I422 (separate Y, U and V rows) to RGBA.
//   y_buf / u_buf / v_buf - source rows.
//   dst_rgba              - destination row.
//   width                 - number of pixels. The loop body runs before the
//                           count is tested and retires 8 pixels per pass, so
//                           callers presumably pass a positive multiple of 8 -
//                           TODO confirm at call sites.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved/restored manually. READYUV422, YUVTORGB and STORERGBA are macros
// defined elsewhere in this file.
// NOTE(review): unlike the ARGB/ABGR variants, xmm5 is not preloaded with
// all-ones here; presumably STORERGBA builds its own alpha - confirm in the
// macro.
29057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToRGBARow_SSSE3(const uint8* y_buf,
2907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* u_buf,
2908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* v_buf,
2909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_rgba,
2910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int width) {
2911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
2913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // Offsets are [esp + 8 (two pushes) + N]; N = 4 is the first argument.
2914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // Y
2915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // U
2916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // V
2917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // rgba
2918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
2919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi             // edi = V - U; presumably READYUV422 reads V at [esi + edi]
2920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    READYUV422
29237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    YUVTORGB(kYuvConstants)
29247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    STORERGBA
2925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8               // 8 pixels retired this pass
2927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop          // loop while the remaining count is > 0
2928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi                  // restore in reverse push order
2930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
2931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
2932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
2933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
2934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_I422TOARGBROW_SSSE3
2936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
29377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I400TOARGBROW_SSE2
29387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
//
// Expands a row of luma to gray ARGB: each output pixel has B = G = R equal
// to the scaled Y value and alpha 0xff.
//   y_buf   - source Y row.
//   rgb_buf - destination ARGB row, written with unaligned stores.
//   width   - number of pixels. Every pass reads 8 Y bytes and writes 32
//             bytes, and the body runs before the count is tested, so a width
//             that is not a positive multiple of 8 still gets a full final
//             group of 8.
// Fixed-point scheme: punpcklbw of a register with itself turns each byte y
// into the word y * 257; pmulhuw by 18997 keeps the high 16 bits, giving
// about y * 1.164 * 64; the saturating subtract of 1160 removes the offset
// for y = 16 (less 32 so the later shift rounds); psrlw 6 divides by 64.
29397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
29407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I400ToARGBRow_SSE2(const uint8* y_buf,
29417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* rgb_buf,
29427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
2943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
29447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, eax
2946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm2, xmm2,0         // broadcast the pair to all 8 words
29477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16) - 64 / 2
29487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movd       xmm3, eax
29497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pshufd     xmm3, xmm3, 0        // broadcast the pair to all 8 words
29507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
29517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pslld      xmm4, 24
29527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    // No registers are pushed, so the first argument sits at [esp + 4].
2953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]       // Y
2954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]       // rgb
2955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]      // width
2956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
2958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [eax]
2960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 8]
29617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0           // Y.Y
29627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm2
2963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubusw    xmm0, xmm3           // unsigned saturating: low Y clamps to 0
2964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 6
2965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0           // G
2966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Step 2: Weave into ARGB
2968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0           // GG
2969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
2970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
2971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
2972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm4           // force the alpha byte to 0xff
2973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm1, xmm4
29747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
29757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1
2976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx,  [edx + 32]
2977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8
2978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
29797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
29807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
29817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
29827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I400TOARGBROW_SSE2
29837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
29847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_I400TOARGBROW_AVX2
29857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
29867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// note: vpunpcklbw mutates and vpackuswb unmutates.
//
// AVX2 version of the gray expansion: each output pixel has B = G = R equal
// to the scaled Y value and alpha 0xff.
//   y_buf   - source Y row.
//   rgb_buf - destination ARGB row, written with unaligned stores.
//   width   - number of pixels. Every pass reads 16 Y bytes and writes 64
//             bytes, and the body runs before the count is tested, so a width
//             that is not a positive multiple of 16 still gets a full final
//             group of 16.
// "Mutates" above refers to the 256-bit unpack/pack instructions working on
// each 128-bit lane separately; the vpermq 0xd8 steps (qword order 0,2,1,3)
// arrange the data so the per-lane results land in pixel order.
29877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
29887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid I400ToARGBRow_AVX2(const uint8* y_buf,
29897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* rgb_buf,
29907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        int width) {
29917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
29927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
29937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm2, eax
29947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm2, xmm2
29957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16) - 64 / 2
29967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm3, eax
29977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastss ymm3, xmm3
29987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
29997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpslld     ymm4, ymm4, 24
30007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    // No registers are pushed, so the first argument sits at [esp + 4].
30017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]       // Y
30027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8]       // rgb
30037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 12]      // width
3004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
30057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian convertloop:
30067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
30077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    xmm0, [eax]
30087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax, [eax + 16]
30097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
30107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
30117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm2
30127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsubusw   ymm0, ymm0, ymm3           // unsigned saturating: low Y clamps to 0
30137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 6
30147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
30157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
30167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // TODO(fbarchard): Weave alpha with unpack.
30177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // Step 2: Weave into ARGB
30187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
30197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
30207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
30217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
30227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm0, ymm0, ymm4           // force the alpha byte to 0xff
30237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpor       ymm1, ymm1, ymm4
30247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
30257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm1
30267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx,  [edx + 64]
30277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
30287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop
30297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                            // clear upper ymm halves before returning
3030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
30337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_I400TOARGBROW_AVX2
3034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_SSSE3
3036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for reversing the bytes.
// Used as a pshufb/vpshufb control: output byte i takes source byte 15 - i.
// NOTE(review): MirrorRow_SSSE3 loads this with movdqa, which needs a 16-byte
// aligned operand; presumably uvec8 guarantees that - confirm at its typedef.
3037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMirror = {
3038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
3040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
30417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// TODO(fbarchard): Replace lea with -16 offset.
//
// Writes the bytes of src to dst in reverse order, 16 bytes per pass.
//   src   - source row.
//   dst   - destination row, filled front to back.
//   width - number of bytes. Each pass loads the 16 bytes ending at
//           src + (remaining width), so a width that is not a positive
//           multiple of 16 makes the final load start before src.
// The byte reversal itself is a single pshufb with kShuffleMirror.
30427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src
3046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst
3047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // width
3048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm5, kShuffleMirror  // aligned load of the reversal table
3049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
30517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax - 16 + ecx]  // last 16 bytes not yet mirrored
3052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm0, xmm5              // reverse byte order
30537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx], xmm0
3054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 16]
30557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16                 // also moves the next load 16 bytes earlier
3056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
3057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_MIRRORROW_SSSE3
3061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_AVX2
// Writes the bytes of src to dst in reverse order, 32 bytes per pass.
//   src   - source row.
//   dst   - destination row, filled front to back.
//   width - number of bytes. Each pass loads the 32 bytes ending at
//           src + (remaining width), so a width that is not a positive
//           multiple of 32 makes the final load start before src.
// vpshufb only shuffles within each 128-bit lane, so the reversal takes two
// steps: reverse the bytes inside each lane, then swap the two lanes.
30637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src
3067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst
3068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // width
30697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastf128 ymm5, kShuffleMirror  // same 16-byte table in both lanes
3070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
30727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu   ymm0, [eax - 32 + ecx]  // last 32 bytes not yet mirrored
3073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb   ymm0, ymm0, ymm5        // reverse bytes within each 128-bit lane
3074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
3075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu   [edx], ymm0
3076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 32]
30777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 32                 // also moves the next load 32 bytes earlier
3078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
3079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper                        // clear upper ymm halves before returning
3080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_MIRRORROW_AVX2
3084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_SSE2
// Writes the bytes of src to dst in reverse order, 16 bytes per pass, using
// only SSE2 instructions (no pshufb).
//   src   - source row.
//   dst   - destination row, filled front to back.
//   width - number of bytes. Each pass loads the 16 bytes ending at
//           src + (remaining width), so a width that is not a positive
//           multiple of 16 makes the final load start before src.
// The 16-byte reversal is built in three stages: swap the two bytes of every
// word, reverse the four words inside each 64-bit half, then swap the halves.
30867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src
3090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst
3091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // width
3092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
30947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax - 16 + ecx]  // last 16 bytes not yet mirrored
3095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, xmm0        // swap bytes
3096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw     xmm0, 8           // low byte of each word -> high byte
3097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw     xmm1, 8           // high byte of each word -> low byte
3098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por       xmm0, xmm1
3099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw   xmm0, xmm0, 0x1b  // swap words
3100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw   xmm0, xmm0, 0x1b
3101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd    xmm0, xmm0, 0x4e  // swap qwords
3102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu    [edx], xmm0
3103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 16]
31047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 16           // also moves the next load 16 bytes earlier
3105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
3106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_MIRRORROW_SSE2
3110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MIRRORROW_UV_SSSE3
3112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for reversing the bytes of UV channels.
// As a pshufb control it both de-interleaves and reverses: the low 8 output
// bytes are the even-indexed source bytes (14, 12, ... 0) and the high 8 are
// the odd-indexed source bytes (15, 13, ... 1), each group in reverse order.
3113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleMirrorUV = {
3114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
3116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reverses a row of interleaved UV pairs and splits it into two rows:
// even source bytes go to dst_u and odd source bytes go to dst_v, both in
// reversed order. The source is walked backwards 16 bytes (8 pairs) at a
// time while both destinations are written forwards.
// width is the number of UV pairs (the source spans width * 2 bytes).
// NOTE(review): each pass handles 8 pairs and the loop body runs before the
// count is tested, so width is presumably a positive multiple of 8 - confirm
// with callers.
31177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       int width) {
3120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push      edi  // edi is used below; restored before ret
3122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4 + 4]   // src (extra + 4 skips the pushed edi)
3123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 4 + 8]   // dst_u
3124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edi, [esp + 4 + 12]  // dst_v
3125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 4 + 16]  // width
3126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa    xmm1, kShuffleMirrorUV  // xmm1 = pshufb control
3127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax + ecx * 2 - 16]  // eax = last 16 source bytes
3128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub       edi, edx  // edi = dst_v - dst_u, so [edx + edi] is dst_v
3129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
31317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax]  // load 8 UV pairs (unaligned)
3132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax - 16]  // step the source backwards
3133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb    xmm0, xmm1  // low 8 = reversed even bytes, high 8 = reversed odd
3134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlpd    qword ptr [edx], xmm0  // low 8 bytes to dst_u
3135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhpd    qword ptr [edx + edi], xmm0  // high 8 bytes to dst_v
3136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 8]  // advance both destinations
31377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 8  // 8 pairs done
3138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop  // repeat while the remaining count is positive
3139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop       edi
3141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_MIRRORROW_UV_SSSE3
3145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
31467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_ARGBMIRRORROW_SSE2
// Reverses the order of the 4 byte pixels in a row; the bytes inside each
// pixel keep their order. The source is read backwards starting from its
// last 16 bytes (4 pixels) and dst is written forwards. width is in pixels.
// NOTE(review): each pass handles 4 pixels and the loop body runs before the
// count is tested, so width is presumably a positive multiple of 4 - confirm
// with callers.
31477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
31487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src
3151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst
3152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // width
3153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
3154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
31567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    xmm0, [eax]  // load 4 pixels (unaligned)
3157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       eax, [eax - 16]  // step the source backwards
31587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pshufd    xmm0, xmm0, 0x1b  // reverse the order of the 4 dwords
31597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu    [edx], xmm0
3160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 16]
31617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 4  // 4 pixels done
3162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
3163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
31667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_ARGBMIRRORROW_SSE2
3167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBMIRRORROW_AVX2
3169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Permute (vpermd) index table for reversing the order of 8 dwords.
// Element i selects source dword 7 - i; bytes inside a dword are not moved.
3170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const ulvec32 kARGBShuffleMirror_AVX2 = {
3171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
3173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Reverses the order of the 4 byte pixels in a row, 8 pixels (32 bytes) per
// pass. ecx counts the pixels still to do and is also used to index the
// source, so each pass reads the last 8 unprocessed source pixels while dst
// is written forwards. width is in pixels.
// NOTE(review): each pass handles 8 pixels and the loop body runs before the
// count is tested, so width is presumably a positive multiple of 8 - confirm
// with callers.
31747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       eax, [esp + 4]   // src
3178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       edx, [esp + 8]   // dst
3179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov       ecx, [esp + 12]  // width
31807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu   ymm5, kARGBShuffleMirror_AVX2  // ymm5 = dword indices 7..0
3181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
31837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermd    ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
3184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu   [edx], ymm0
3185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea       edx, [edx + 32]
31867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub       ecx, 8  // 8 pixels done; also moves the source window back
3187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg        convertloop
3188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper  // zero the upper halves of the ymm registers before returning
3189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBMIRRORROW_AVX2
3193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SPLITUVROW_SSE2
// De-interleaves a row of UV pairs: even source bytes are written to dst_u
// and odd source bytes to dst_v. Each pass consumes 32 source bytes and
// writes 16 bytes to each destination. pix is decremented by 16 per pass,
// so it counts UV pairs.
// NOTE(review): the loop body runs before the count is tested and always
// handles 16 pairs, so pix is presumably a positive multiple of 16 - confirm
// with callers.
31957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi  // edi is used below; restored before ret
3199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_uv
3200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // dst_u
3201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_v
3202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // pix
3203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
3205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx  // edi = dst_v - dst_u, so [edx + edi] is dst_v
3206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // pairs 0..7
3209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]  // pairs 8..15
3210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
3211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0  // keep copies for the odd byte extraction
3212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm1
3213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5   // even bytes
3214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5
3215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1  // 16 even bytes; words are <= 0xff, no saturation
3216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 8      // odd bytes
3217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm3, 8
3218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm2, xmm3  // 16 odd bytes
3219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0  // even bytes to dst_u
3220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx + edi], xmm2  // odd bytes to dst_v
3221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
3222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
3223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
32297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SPLITUVROW_SSE2
3231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a row of UV pairs using 256 bit registers: even source
// bytes are written to dst_u and odd source bytes to dst_v. Each pass
// consumes 64 source bytes and writes 32 bytes to each destination. pix is
// decremented by 32 per pass, so it counts UV pairs.
// NOTE(review): the loop body runs before the count is tested and always
// handles 32 pairs, so pix is presumably a positive multiple of 32 - confirm
// with callers.
32337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi  // edi is used below; restored before ret
3237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_uv
3238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // dst_u
3239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_v
3240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // pix
3241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm5, ymm5, 8
3243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx  // edi = dst_v - dst_u, so [edx + edi] is dst_v
3244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]  // pairs 0..15
3247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]  // pairs 16..31
3248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
3249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm2, ymm0, 8      // odd bytes
3250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm3, ymm1, 8
3251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm5   // even bytes
3252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpand      ymm1, ymm1, ymm5
3253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1  // packs within each 128 bit lane
3254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm2, ymm2, ymm3
3255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8  // qword order 0,2,1,3 undoes the lane split
3256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm2, ymm2, 0xd8
3257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0  // even bytes to dst_u
3258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx + edi], ymm2  // odd bytes to dst_v
3259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
3260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 32
3261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper  // zero the upper halves of the ymm registers before returning
3265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SPLITUVROW_AVX2
3269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MERGEUVROW_SSE2
// Interleaves a U row and a V row into one UV row: each src_u byte is
// followed by the src_v byte at the same index. Each pass reads 16 bytes
// from each source and writes 32 bytes to dst_uv. width is decremented by
// 16 per pass, so it counts UV pairs.
// NOTE(review): the loop body runs before the count is tested and always
// handles 16 pairs, so width is presumably a positive multiple of 16 -
// confirm with callers.
32717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     int width) {
3274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi  // edi is used below; restored before ret
3276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_u
3277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // src_v
3278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_uv
3279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // width
3280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax  // edx = src_v - src_u, so [eax + edx] is src_v
3281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]      // read 16 U bytes
3284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + edx]  // and 16 V bytes
3285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 16]
3286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0  // second copy of U for the high half
3287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm1       // first 8 UV pairs
3288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm2, xmm1       // next 8 UV pairs
3289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi], xmm0
3290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi + 16], xmm2
3291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 32]
3292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
3293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  //  HAS_MERGEUVROW_SSE2
3300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_MERGEUVROW_AVX2
// Interleaves a U row and a V row into one UV row using 256 bit registers:
// each src_u byte is followed by the src_v byte at the same index. Each pass
// reads 32 bytes from each source and writes 64 bytes to dst_uv. width is
// decremented by 32 per pass, so it counts UV pairs.
// The byte unpacks work within each 128 bit lane, so the four 16 byte
// halves are stored individually in the order that restores linear output.
// NOTE(review): the loop body runs before the count is tested and always
// handles 32 pairs, so width is presumably a positive multiple of 32 -
// confirm with callers.
33027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     int width) {
3305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi  // edi is used below; restored before ret
3307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_u
3308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // src_v
3309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_uv
3310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // width
3311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax  // edx = src_v - src_u, so [eax + edx] is src_v
3312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]           // read 32 U bytes
3315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + edx]     // and 32 V bytes
3316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
3317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklbw ymm2, ymm0, ymm1      // UV pairs 0..7 (low lane), 16..23 (high lane)
3318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhbw ymm0, ymm0, ymm1      // UV pairs 8..15 (low lane), 24..31 (high lane)
33197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edi], ymm2, 0       // bytes 0..15
33207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
33217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
33227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
3323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 64]
3324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 32
3325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper  // zero the upper halves of the ymm registers before returning
3329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  //  HAS_MERGEUVROW_AVX2
3333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_COPYROW_SSE2
3335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// CopyRow copies count bytes using 16 byte loads/stores, 32 bytes at a time.
// Loads and stores are unaligned (movdqu), so src and dst need no alignment.
// NOTE(review): the loop body runs before the count is tested and always
// moves a whole 32 byte chunk, so count is presumably a positive multiple of
// 32 - confirm with callers.
33367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src
3340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst
3341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // count
3342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
33447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // load 32 bytes
33457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
33477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0  // store 32 bytes
33487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1
3349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
3350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 32
3351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_COPYROW_SSE2
3356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
33577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_COPYROW_AVX
33587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// CopyRow copies count bytes using 32 byte loads/stores, 64 bytes at a time.
// Loads and stores are unaligned (vmovdqu), so src and dst need no alignment.
// NOTE(review): the loop body runs before the count is tested and always
// moves a whole 64 byte chunk, so count is presumably a positive multiple of
// 64 - confirm with callers.
33597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
33607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
33627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src
33637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst
3364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // count
33657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
33667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  convertloop:
33677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]  // load 64 bytes
33687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
33697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax, [eax + 64]
33707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0  // store 64 bytes
33717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm1
33727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 64]
33737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 64
33747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg         convertloop
33757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
33767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper  // zero the upper halves of the ymm registers before returning
3377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
33807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_COPYROW_AVX
3381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
33827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Copies count bytes with a single rep movsb. Multiple of 1: the byte
// granular string copy needs no block size multiple.
// esi and edi are parked in eax and edx instead of being pushed, so the
// argument offsets stay at esp + 4, + 8 and + 12.
// NOTE(review): rep movsb copies forwards only when the direction flag is
// clear - presumably guaranteed by the calling convention; confirm.
33837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
33847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, esi  // save esi
3387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, edi  // save edi
3388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4]   // src
3389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8]   // dst
3390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // count
33917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    rep movsb  // copy ecx bytes from [esi] to [edi]
3392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, edx  // restore edi
3393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, eax  // restore esi
3394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYALPHAROW_SSE2
3399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels
// Copies the top byte of every 4 byte pixel (the alpha channel, per the
// function name) from src into dst and leaves the low three bytes of each
// dst pixel unchanged. dst is read, merged and written back in place.
// Each pass handles 8 pixels (32 bytes).
// NOTE(review): the loop body runs before the count is tested, so width is
// presumably a positive multiple of 8 - confirm with callers.
34007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src
3404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst
3405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
3406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm0, 24
3408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm1, 8
3410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
34127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax]  // 8 source pixels
34137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 16]
3414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
34157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [edx]  // 8 destination pixels
34167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [edx + 16]
3417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm0  // keep only the top byte of each source pixel
3418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm3, xmm0
3419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm4, xmm1  // keep the low three bytes of each dst pixel
3420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm5, xmm1
3421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm2, xmm4  // merge
3422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm3, xmm5
34237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm2
34247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm3
3425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
3426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8
3427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBCOPYALPHAROW_SSE2
3433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYALPHAROW_AVX2
3435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels
// Copies the top byte of every 4 byte pixel (the alpha channel, per the
// function name) from src into dst and leaves the low three bytes of each
// dst pixel unchanged. dst is read, blended and written back in place.
// Each pass handles 16 pixels (64 bytes).
// NOTE(review): the loop body runs before the count is tested, so width is
// presumably a positive multiple of 16 - confirm with callers.
34367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src
3440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst
3441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
3442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm0, ymm0, ymm0
3443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax]  // 16 source pixels
3447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm2, [eax + 32]
3448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
3449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpblendvb  ymm1, ymm1, [edx], ymm0  // mask byte set: dst byte, else src byte
3450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm1
3452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm2
3453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 64]
3454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
3455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper  // zero the upper halves of the ymm registers before returning
3458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBCOPYALPHAROW_AVX2
3462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// Writes each src byte (one Y value per pixel) into the top byte of the
// matching 4-byte dst pixel, keeping the low three bytes of dst.
// Each loop pass consumes 8 src bytes and rewrites 8 dst pixels (32 bytes);
// the count is tested after the pass, so at least one pass always executes.
3464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels
34657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src
3469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst
3470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width (pixel count)
3471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm0, 24
3473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm1, 8
3475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [eax]  // 8 Y's
3478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 8]
3479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm2       // each Y byte doubled into a word
    // The upper 4 words of xmm2 land in the high half of each dword of xmm3.
    // The low halves hold whatever xmm3 contained before; they are cleared by
    // the 0xff000000 mask below. This must run before punpcklwd overwrites
    // xmm2.
3480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm3, xmm2
3481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm2, xmm2       // lower 4 words doubled into dwords
34827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [edx]      // current dst pixels 0..3
34837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [edx + 16] // current dst pixels 4..7
3484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm0       // keep Y in the top byte only
3485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm3, xmm0
3486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm4, xmm1       // keep low 3 bytes of dst
3487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm5, xmm1
3488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm2, xmm4       // merge new top byte with old low bytes
3489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm3, xmm5
34907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm2
34917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm3
3492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
3493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8           // 8 pixels done
3494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// AVX2 version: writes each src byte (one Y value per pixel) into the top
// byte of the matching 4-byte dst pixel, keeping the low three bytes of dst.
// Each loop pass consumes 16 src bytes and rewrites 16 dst pixels (64 bytes);
// the count is tested after the pass, so at least one pass always executes.
3502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width in pixels
35037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src
3507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst
3508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width (pixel count)
3509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm0, ymm0, ymm0  // all bits set
3510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmovzxbd  ymm1, qword ptr [eax]      // 8 Y bytes -> 8 dwords
3514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmovzxbd  ymm2, qword ptr [eax + 8]  // next 8 Y bytes
3515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
3516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpslld     ymm1, ymm1, 24   // move Y into the top byte of each dword
3517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpslld     ymm2, ymm2, 24
    // Where the mask byte has its high bit set (low 3 bytes of each pixel)
    // the dst byte from memory is selected; the top byte keeps the Y value.
3518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpblendvb  ymm1, ymm1, [edx], ymm0
3519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm1
3521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm2
3522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 64]
3523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16          // 16 pixels done
3524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
3527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SETROW_X86
35337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Write 'count' bytes using an 8 bit value repeated.
35347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Count should be multiple of 4.
// The fill is done with 32-bit stores: only count >> 2 dwords are written, so
// when count is not a multiple of 4 the last count % 4 bytes are not written.
35357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
35367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid SetRow_X86(uint8* dst, uint8 v8, int count) {
35377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
35387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movzx      eax, byte ptr [esp + 8]    // v8
35397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, 0x01010101  // Duplicate byte to all bytes.
35407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mul        edx              // overwrites edx with upper part of result.
3541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, edi         // save edi; it is restored before ret
3542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4]   // dst
3543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // count
3544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shr        ecx, 2           // byte count -> dword count
3545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    rep stosd                   // store eax at [edi], ecx times
3546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, edx         // restore edi
3547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
35517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Write 'count' bytes using an 8 bit value repeated.
// Uses a byte-granular 'rep stosb', so count needs no particular alignment.
// NOTE(review): the ERMS suffix presumably refers to CPUs with Enhanced REP
// MOVSB/STOSB - confirm against the code that selects this function.
35527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
35537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
35557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, edi         // save edi; it is restored before ret
35567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 4]   // dst
35577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8]   // v8 (only al is stored by stosb)
35587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // count
35597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    rep stosb                   // store al at [edi], ecx times
35607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, edx         // restore edi
35617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
35627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
35637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
3564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
35657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Write 'count' 32 bit values.
// Here count is a number of dwords, not bytes: count * 4 bytes are written
// starting at dst_argb, each dword holding v32.
35667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
35677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
35687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
35697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, edi         // save edi; it is restored before ret
35707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, [esp + 4]   // dst
35717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 8]   // v32
35727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // count
35737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    rep stosd                   // store eax at [edi], ecx times
35747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edi, edx         // restore edi
3575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3578ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SETROW_X86
3579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_YUY2TOYROW_AVX2
// Extracts the Y bytes (the even bytes) of a YUY2 row into dst_y.
// Each loop pass reads 64 source bytes and writes 32 Y bytes (32 pixels);
// the count is tested after the pass, so at least one pass always executes.
35817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToYRow_AVX2(const uint8* src_yuy2,
3583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_y, int pix) {
3584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // src_yuy2
3586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]    // dst_y
3587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // pix
3588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm5, ymm5, 8
3590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
3593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
3594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
3595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm5   // even bytes are Y
3596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpand      ymm1, ymm1, ymm5
    // vpackuswb packs within each 128-bit lane, so the qwords come out in the
    // order 0,2,1,3; vpermq 0xd8 puts them back in linear order.
3597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
3599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
3600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
36017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32            // 32 pixels done
3602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
3604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Extracts U and V from two YUY2 rows: the row at src_yuy2 and the row
// stride_yuy2 bytes after it are averaged byte-wise, then the U bytes go to
// dst_u and the V bytes to dst_v.
// Each loop pass reads 64 bytes from each row (32 pixels) and writes 16 U
// and 16 V bytes; the count is tested after the pass, so at least one pass
// always executes.
36087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
3611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
3613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The two pushes moved esp by 8, hence the '+ 8' in every argument offset.
3614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_yuy2
3615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // stride_yuy2
3616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // dst_u
3617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]   // dst_v
3618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // pix
3619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm5, ymm5, 8
    // edi becomes dst_v - dst_u, so [edx + edi] addresses dst_v and only edx
    // has to be advanced in the loop.
3621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx
3622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
3625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
3626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm0, ymm0, [eax + esi]       // average with the second row
3627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpavgb     ymm1, ymm1, [eax + esi + 32]
3628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
3629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm1, ymm1, 8
    // vpackuswb packs within each 128-bit lane, so the qwords come out in the
    // order 0,2,1,3; vpermq 0xd8 puts them back in linear order.
3631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
3633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpand      ymm1, ymm0, ymm5  // U
3634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8     // V
3635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    // After the permute the 16 packed bytes sit in the low 128 bits, which is
    // the half that vextractf128 with index 0 stores.
3637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
3638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
3639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vextractf128 [edx], ymm1, 0  // U
3640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vextractf128 [edx + edi], ymm0, 0 // V
3641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
3642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 32           // 32 pixels done
3643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
3647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
3648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Extracts U and V from a single YUY2 row (no averaging with a second row):
// the U bytes go to dst_u and the V bytes to dst_v.
// Each loop pass reads 64 source bytes (32 pixels) and writes 16 U and 16 V
// bytes; the count is tested after the pass, so at least one pass always
// executes.
36527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
3653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_u, uint8* dst_v, int pix) {
3655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The push moved esp by 4, hence the '+ 4' in every argument offset.
3657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_yuy2
3658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // dst_u
3659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_v
3660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // pix
36617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
36627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm5, ymm5, 8
    // edi becomes dst_v - dst_u, so [edx + edi] addresses dst_v and only edx
    // has to be advanced in the loop.
36637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, edx
36647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
36657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  convertloop:
36667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
36677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
36687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
36697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
36707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm1, ymm1, 8
    // vpackuswb packs within each 128-bit lane, so the qwords come out in the
    // order 0,2,1,3; vpermq 0xd8 puts them back in linear order.
36717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // mutates.
36727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
36737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm0, ymm5  // U
36747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8     // V
36757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm1, ymm1, ymm1  // mutates.
36767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    // After the permute the 16 packed bytes sit in the low 128 bits, which is
    // the half that vextractf128 with index 0 stores.
36777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
36787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
36797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edx], ymm1, 0  // U
36807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edx + edi], ymm0, 0 // V
36817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 16]
36827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32           // 32 pixels done
3683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
36867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
3687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Extracts the Y bytes (the odd bytes) of a UYVY row into dst_y.
// Each loop pass reads 64 source bytes and writes 32 Y bytes (32 pixels);
// the count is tested after the pass, so at least one pass always executes.
36917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
36927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToYRow_AVX2(const uint8* src_uyvy,
36937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                     uint8* dst_y, int pix) {
3694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
36957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // src_uyvy
3696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]    // dst_y
3697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // pix
3698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
37007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
37017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
37027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
37037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
37047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm1, ymm1, 8
    // vpackuswb packs within each 128-bit lane, so the qwords come out in the
    // order 0,2,1,3; vpermq 0xd8 puts them back in linear order.
37057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // mutates.
37067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
37077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
37087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 32]
37097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32            // 32 pixels done
3710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
37117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
3712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Extracts U and V from two UYVY rows: the row at src_uyvy and the row
// stride_uyvy bytes after it are averaged byte-wise, then the U bytes go to
// dst_u and the V bytes to dst_v.
// Each loop pass reads 64 bytes from each row (32 pixels) and writes 16 U
// and 16 V bytes; the count is tested after the pass, so at least one pass
// always executes.
37167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
37177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
37187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
3719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
3721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The two pushes moved esp by 8, hence the '+ 8' in every argument offset.
3722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_uyvy
3723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // stride_uyvy
3724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // dst_u
3725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]   // dst_v
3726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // pix
37277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
37287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm5, ymm5, 8
    // edi becomes dst_v - dst_u, so [edx + edi] addresses dst_v and only edx
    // has to be advanced in the loop.
3729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx
3730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
37327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
37337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
37347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb     ymm0, ymm0, [eax + esi]       // average with the second row
37357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb     ymm1, ymm1, [eax + esi + 32]
37367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax,  [eax + 64]
37377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
37387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm1, ymm5
    // vpackuswb packs within each 128-bit lane, so the qwords come out in the
    // order 0,2,1,3; vpermq 0xd8 puts them back in linear order.
37397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // mutates.
37407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
37417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm0, ymm5  // U
37427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8     // V
37437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm1, ymm1, ymm1  // mutates.
37447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm0  // mutates.
    // After the permute the 16 packed bytes sit in the low 128 bits, which is
    // the half that vextractf128 with index 0 stores.
37457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8
37467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8
37477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edx], ymm1, 0  // U
37487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edx + edi], ymm0, 0 // V
37497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 16]
37507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32           // 32 pixels done
3751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
37557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
3756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Splits the U and V samples of one UYVY row into dst_u and dst_v (no
// vertical averaging). Naked function: arguments are read from the stack.
// Each iteration consumes 64 source bytes (32 pixels) and stores 16 U and
// 16 V bytes. The count is tested after the body, so one iteration always
// runs; presumably callers pass pix as a positive multiple of 32 - confirm.
37607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
37617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUV422Row_AVX2(const uint8* src_uyvy,
37627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         uint8* dst_u, uint8* dst_v, int pix) {
3763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                   // saved here, popped before ret
3765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_uyvy
3766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // dst_u
3767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_v
3768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // pix
37697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
37707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm5, ymm5, 8
3771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
3772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
37747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]
37757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]
37767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax,  [eax + 64]   // advance source by 64 bytes
37777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm0, ymm0, ymm5   // keep even bytes: UYVY -> UVUV
37787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm1, ymm5
37797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // mutates: packs within each 128-bit lane.
37807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8   // swap middle qwords to undo the lane-wise pack
37817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand      ymm1, ymm0, ymm5  // U (even bytes of UVUV)
37827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8     // V (odd bytes of UVUV)
37837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm1, ymm1, ymm1  // mutates: packs within each 128-bit lane.
37847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm0  // mutates: packs within each 128-bit lane.
37857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm1, ymm1, 0xd8  // gather the 16 U bytes into the low 128 bits
37867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq     ymm0, ymm0, 0xd8  // gather the 16 V bytes into the low 128 bits
37877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edx], ymm1, 0  // store 16 bytes of U
37887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vextractf128 [edx + edi], ymm0, 0 // store 16 bytes of V
37897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 16]
37907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32           // 32 pixels consumed per iteration
3791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
37947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper                   // zero upper ymm halves before returning
3795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
37987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_YUY2TOYROW_AVX2
3799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
38007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y samples of one YUY2 row into dst_y.
// Naked function: arguments are read from the stack.
// Each iteration reads 32 source bytes and stores 16 Y bytes (16 pixels).
// The count is tested after the body, so one iteration always runs;
// presumably callers pass pix as a positive multiple of 16 - confirm.
38017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
38027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid YUY2ToYRow_SSE2(const uint8* src_yuy2,
3803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_y, int pix) {
3804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
38057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // src_yuy2
3806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]    // dst_y
3807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // pix
38087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
38097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrlw      xmm5, 8
3810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
38127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
38137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]   // advance source by 32 bytes
38157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pand       xmm0, xmm5   // even bytes are Y
38167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pand       xmm1, xmm5
3817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1   // 16 words -> 16 Y bytes
38187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
3819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
38207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16      // 16 pixels consumed per iteration
3821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Produces one row of U and one row of V from two YUY2 rows: src_yuy2 and
// src_yuy2 + stride_yuy2 are averaged bytewise with pavgb before the chroma
// bytes are separated. Naked function: arguments are read from the stack.
// Each iteration reads 32 bytes (16 pixels) from each row and stores 8 U and
// 8 V bytes. The count is tested after the body, so one iteration always
// runs; presumably callers pass pix as a positive multiple of 16 - confirm.
38267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
38277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
3829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi                   // saved here, popped before ret
3831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                   // saved here, popped before ret
3832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_yuy2
3833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // stride_yuy2
3834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // dst_u
3835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]   // dst_v
3836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // pix
3837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
3839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
3840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
38427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
38437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
38447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + esi]       // same columns, second row
38457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + esi + 16]
3846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
3847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2   // rounded average of the two rows
3848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
38497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrlw      xmm0, 8      // keep odd bytes: YUYV -> UVUV
38507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrlw      xmm1, 8
3851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
3852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
3853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5  // U (even bytes of UVUV)
3854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
3855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8     // V (odd bytes of UVUV)
3856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm1, xmm1
3857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0        // store 8 bytes of U
3858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + edi], xmm1  // store 8 bytes of V
3859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
3860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16     // 16 pixels consumed per iteration
3861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
3865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Splits the U and V samples of a single YUY2 row into dst_u and dst_v (no
// second row is read, so there is no vertical averaging).
// Naked function: arguments are read from the stack.
// Each iteration reads 32 source bytes (16 pixels) and stores 8 U and 8 V
// bytes. The count is tested after the body, so one iteration always runs;
// presumably callers pass pix as a positive multiple of 16 - confirm.
38697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
38707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         uint8* dst_u, uint8* dst_v, int pix) {
3872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                   // saved here, popped before ret
3874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_yuy2
3875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // dst_u
3876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_v
3877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // pix
3878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
3880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
3881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
38837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
38847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
38867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrlw      xmm0, 8      // keep odd bytes: YUYV -> UVUV
38877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    psrlw      xmm1, 8
3888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
3889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
3890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5  // U (even bytes of UVUV)
3891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
3892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8     // V (odd bytes of UVUV)
3893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm1, xmm1
3894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0        // store 8 bytes of U
3895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + edi], xmm1  // store 8 bytes of V
3896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
3897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16     // 16 pixels consumed per iteration
3898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Extracts the Y samples of one UYVY row into dst_y.
// Naked function: arguments are read from the stack.
// Each iteration reads 32 source bytes and stores 16 Y bytes (16 pixels).
// The count is tested after the body, so one iteration always runs;
// presumably callers pass pix as a positive multiple of 16 - confirm.
39057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
39067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToYRow_SSE2(const uint8* src_uyvy,
39077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                     uint8* dst_y, int pix) {
3908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // src_uyvy
3910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]    // dst_y
3911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // pix
3912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
3915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]   // advance source by 32 bytes
3917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8    // odd bytes are Y
3918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
3919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1  // 16 words -> 16 Y bytes
3920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
3921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
39227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16     // 16 pixels consumed per iteration
3923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Produces one row of U and one row of V from two UYVY rows: src_uyvy and
// src_uyvy + stride_uyvy are averaged bytewise with pavgb before the chroma
// bytes are separated. Naked function: arguments are read from the stack.
// Each iteration reads 32 bytes (16 pixels) from each row and stores 8 U and
// 8 V bytes. The count is tested after the body, so one iteration always
// runs; presumably callers pass pix as a positive multiple of 16 - confirm.
39287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
39297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
39307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                      uint8* dst_u, uint8* dst_v, int pix) {
3931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi                   // saved here, popped before ret
3933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                   // saved here, popped before ret
3934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_uyvy
3935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // stride_uyvy
3936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // dst_u
3937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]   // dst_v
3938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // pix
3939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
3941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
3942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
3945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax + esi]       // same columns, second row
3947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax + esi + 16]
3948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
3949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2   // rounded average of the two rows
3950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
3951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5   // keep even bytes: UYVY -> UVUV
3952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5
3953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
3954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
3955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5  // U (even bytes of UVUV)
3956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
3957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8     // V (odd bytes of UVUV)
3958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm1, xmm1
3959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0        // store 8 bytes of U
3960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + edi], xmm1  // store 8 bytes of V
3961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
3962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16     // 16 pixels consumed per iteration
3963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
3964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
3966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
3967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
3968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
3969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Splits the U and V samples of a single UYVY row into dst_u and dst_v (no
// second row is read, so there is no vertical averaging).
// Naked function: arguments are read from the stack.
// Each iteration reads 32 source bytes (16 pixels) and stores 8 U and 8 V
// bytes. The count is tested after the body, so one iteration always runs;
// presumably callers pass pix as a positive multiple of 16 - confirm.
39717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
39727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy,
39737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         uint8* dst_u, uint8* dst_v, int pix) {
3974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
3975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                   // saved here, popped before ret
3976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_uyvy
3977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]    // dst_u
3978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 4 + 12]   // dst_v
3979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // pix
3980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
3982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
3983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
3985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
3986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
3988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5   // keep even bytes: UYVY -> UVUV
3989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5
3990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
3991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
3992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5  // U (even bytes of UVUV)
3993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
3994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8     // V (odd bytes of UVUV)
3995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm1, xmm1
3996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0        // store 8 bytes of U
3997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + edi], xmm1  // store 8 bytes of V
3998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
3999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16     // 16 pixels consumed per iteration
4000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
4003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_YUY2TOYROW_SSE2
4007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBBLENDROW_SSE2
4009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blend 4 pixels at a time, then any remaining pixels 1 at a time.
// Per channel: dst = src_argb0 + ((src_argb1 * (256 - src_argb0 alpha)) >> 8)
// using saturating byte adds; the destination alpha byte is forced to 255.
// src_argb0 is added unscaled, so it is presumably expected to be already
// multiplied by its own alpha - confirm with callers.
// Naked function: arguments are read from the stack. A width of 0 or less
// falls through both loop guards without touching memory.
40107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint8* dst_argb, int width) {
4013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi                  // saved here, popped before ret
4015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm7, xmm7       // generate constant 1 in every 16-bit word
4020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm7, 15
4021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm6, 8
4023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw      xmm5, 8
4025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm4, 24
40277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
40287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jl         convertloop4b    // less than 4 pixels?
4029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop.
4031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop4:
4032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax]      // src argb
4033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm3       // src argb
4035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm3, xmm4       // ~alpha
4036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [esi]      // src_argb1, masked to _r_b below
4037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm3, 8          // 255 - alpha in the high word of each pixel
4038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
4039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm3, xmm3, 0F5h
4040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm6       // _r_b
4041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm3, xmm7       // 256 - alpha
4042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
4043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi]      // src_argb1 again, shifted to _a_g below
4044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
4045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8          // _a_g
4046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm4       // set alpha to 255
4047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm2       // + src argb
4050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1       // + src argb
40527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
40547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        convertloop4
4056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop4b:
4058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 4 - 1       // ecx = remaining pixels - 1
4059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         convertloop1b    // no pixels left?
4060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel loop.
4062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop1:
4063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [eax]      // src argb
4064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
4065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm3       // src argb
4066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm3, xmm4       // ~alpha
4067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esi]      // src_argb1, masked to _r_b below
4068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm3, 8          // 255 - alpha in the high word of each pixel
4069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
4070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm3, xmm3, 0F5h
4071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm6       // _r_b
4072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm3, xmm7       // 256 - alpha
4073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
4074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [esi]      // src_argb1 again, shifted to _a_g below
4075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 4]
4076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8          // _a_g
4077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm4       // set alpha to 255
4078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm2       // + src argb
4081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1       // + src argb
4083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx], xmm0
4084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 4]
40857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 1
4086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        convertloop1
4087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop1b:
4089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBBLENDROW_SSE2
4094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBBLENDROW_SSSE3
4096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for isolating alpha.
// pshufb control bytes: indices 3, 7, 11 and 15 pick byte 3 (alpha) of each of
// the four 4-byte pixels in a 16-byte vector, and 0x80 (high bit set) produces
// a zero byte.  Each alpha is therefore emitted twice, as two zero-extended
// 16-bit words, lining up with the two word lanes of its pixel.
4097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha = {
4098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
4101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Same as SSE2, but replaces:
4102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    psrlw      xmm3, 8          // alpha
4103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
4104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    pshuflw    xmm3, xmm3, 0F5h
4105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// with..
4106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    pshufb     xmm3, kShuffleAlpha // alpha
4107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blend 4 pixels at a time, then 1 pixel at a time for the remainder.
// Per byte: dst = src_argb0 + ((src_argb1 * (256 - alpha of src_argb0)) >> 8),
// accumulated with saturating byte adds, with the destination alpha forced to
// 255.
// Naked function: the arguments are read straight from the stack through esp,
// and esi is the only general-purpose register that is saved and restored.
4108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
41097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb, int width) {
4112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi                  // shifts the stack arguments by 4 bytes
4114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
4119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm7, 15
4120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm6, 8
4122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw      xmm5, 8
4124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm4, 24
41267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
41277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jl         convertloop4b    // less than 4 pixels?
4128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop.
4130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop4:
4131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, [eax]      // src argb
4132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm3       // src argb
4134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm3, xmm4       // ~alpha
4135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [esi]      // _r_b
4136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm3, kShuffleAlpha // ~alpha (255 - alpha) as 8 words
4137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm6       // _r_b
4138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm3, xmm7       // 256 - alpha
4139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
4140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi]      // _a_g
4141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
4142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8          // _a_g
4143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm4       // set alpha to 255
4144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm2       // + src argb
4147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1       // + src argb
41497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
41517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
41527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jge        convertloop4
4153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop4b:
4155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 4 - 1       // undo the -4 bias; ecx = remaining pixels - 1
4156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         convertloop1b    // no pixels left
4157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel loop.
4159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop1:
4160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [eax]      // src argb
4161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
4162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm3       // src argb
4163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm3, xmm4       // ~alpha
4164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esi]      // _r_b
4165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm3, kShuffleAlpha // ~alpha (255 - alpha) as words
4166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm6       // _r_b
4167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm3, xmm7       // 256 - alpha
4168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
4169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [esi]      // _a_g
4170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 4]
4171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8          // _a_g
4172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm4       // set alpha to 255
4173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm2       // + src argb
4176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1       // + src argb
4178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx], xmm0
4179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 4]
41807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 1
4181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        convertloop1
4182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop1b:
4184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBBLENDROW_SSSE3
4189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBATTENUATEROW_SSE2
4191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Attenuate 4 pixels at a time.
// Scales the three colour bytes of every 4-byte pixel by that pixel's alpha
// (byte 3) and writes the alpha byte through unchanged.
// There is no remainder loop: every pass reads and writes a full 16 bytes, and
// the loop repeats while the count is still positive after subtracting 4.
// NOTE(review): callers presumably pass a width that is a multiple of 4 (or
// buffers padded to one) - confirm against the dispatch code.
41927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_argb
4196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
4197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
4198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm4, 24
4200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
4201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm5, 8
4202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
42047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]      // read 4 pixels
4205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0       // first 2 pixels, each byte doubled into a word
4206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
4207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm2, xmm2, 0FFh
4208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm2       // rgb * a
42097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax]      // read 4 pixels
4210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm1       // next 2 pixels
4211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
4212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm2, xmm2, 0FFh
4213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm2       // rgb * a
42147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax]      // alphas
4215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
4217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm4       // isolate original alphas
4218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
4219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
4220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5       // clear the computed alpha lane
4221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm2       // keep original alphas
42227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
42247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBATTENUATEROW_SSE2
4231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBATTENUATEROW_SSSE3
4233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table duplicating alpha.
// pshufb control bytes.  For each pixel the alpha byte is copied into both
// bytes of the first three 16-bit words, and the fourth word is zeroed (128
// has the high bit set), so multiplying by it leaves a zero in the alpha lane.
// kShuffleAlpha0 covers the first two pixels of a 16-byte vector (alpha bytes
// 3 and 7); kShuffleAlpha1 covers the last two (alpha bytes 11 and 15).
4234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha0 = {
4235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
4237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha1 = {
4238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
// Attenuate 4 pixels per iteration using pshufb to spread each pixel's alpha.
// The shuffle tables zero the alpha word of the multiplier, so the product has
// 0 in every alpha lane and the original alpha can simply be OR-ed back in.
// There is no remainder loop: every pass reads and writes a full 16 bytes, and
// the loop repeats while the count is still positive after subtracting 4.
42417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_argb
4245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
4246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
4247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
4248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm3, 24
4249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShuffleAlpha0  // aligned load: shuffle for pixels 0-1
4250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kShuffleAlpha1  // aligned load: shuffle for pixels 2-3
4251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]      // read 4 pixels
4254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4       // isolate first 2 alphas
4255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax]      // read 4 pixels
4256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
4257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm1       // rgb * a
4258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax]      // read 4 pixels
4259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm5       // isolate next 2 alphas
4260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax]      // read 4 pixels
4261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
4262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm2       // rgb * a
4263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax]      // mask original alpha
4264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm3
4266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
4267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
4268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
4269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm2       // copy original alpha
4270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
42727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBATTENUATEROW_SSSE3
4279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBATTENUATEROW_AVX2
4281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table duplicating alpha.
// vpshufb control bytes applied to pixels already widened to 16-bit words:
// bytes 6-7 (and 14-15) hold the alpha word of a pixel, which is copied into
// that pixel's first three words; 128 zeroes the fourth (alpha) word.
42827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic const uvec8 kShuffleAlpha_AVX2 = {
42837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
4284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
// Attenuate 8 pixels per iteration with AVX2.
// edx is turned into (dst_argb - src_argb) so a single advancing pointer (eax)
// addresses both the source and, via [eax + edx], the destination.
// "mutated" in the comments below: the unpack instructions work within each
// 128-bit lane, so pixel order is interleaved by lane until vpackuswb (also
// per lane) puts it back.
// There is no remainder loop: every pass reads and writes a full 32 bytes, and
// the loop repeats while the count is still positive after subtracting 8.
42857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_argb
4289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
4290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
4291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax         // edx = dst_argb - src_argb
42927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastf128 ymm4,kShuffleAlpha_AVX2
4293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
4294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpslld     ymm5, ymm5, 24
4295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm6, [eax]       // read 8 pixels.
4298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
4301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
4302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
4303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
4304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpand      ymm6, ymm6, ymm5  // isolate alpha
4305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 8
4306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrlw     ymm1, ymm1, 8
4307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpor       ymm0, ymm0, ymm6  // copy original alpha
4309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [eax + edx], ymm0 // store to dst_argb
4310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
43117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
4315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBATTENUATEROW_AVX2
4319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBUNATTENUATEROW_SSE2
4321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Unattenuate 4 pixels at a time.
// For every pixel, the alpha byte indexes a 32-bit entry of fixed_invtbl8
// (declared outside this block).  pshuflw 040h spreads that entry so the
// pixel's first three words get the entry's low word and the fourth (alpha)
// word gets the entry's high word; pmulhuw then scales the widened channels.
// NOTE(review): per the "1, a, a, a" remarks the low word is presumably the
// fixed-point reciprocal of alpha and the high word a unit multiplier that
// leaves alpha unchanged - confirm against the fixed_invtbl8 definition.
// There is no remainder loop: every pass reads and writes a full 16 bytes, and
// the loop repeats while the count is still positive after subtracting 4.
43227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                             int width) {
4325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
4327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi                  // two pushes: stack arguments shift by 8
4328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_argb
4329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 8]   // dst_argb
4330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 12]  // width
4331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]      // read 4 pixels
4334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      esi, byte ptr [eax + 3]  // first alpha
4335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edi, byte ptr [eax + 7]  // second alpha
4336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0       // first 2
4337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
4338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
4339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
4340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlhps    xmm2, xmm3       // combine both pixels' multipliers
4342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm2       // rgb * a
4343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax]      // read 4 pixels
4345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      esi, byte ptr [eax + 11]  // third alpha
4346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edi, byte ptr [eax + 15]  // fourth alpha
4347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm1       // next 2
4348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
4349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
4350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
4351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
4352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlhps    xmm2, xmm3       // combine both pixels' multipliers
4353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm2       // rgb * a
4354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1       // saturate words back to bytes
4357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
43597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
4362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBUNATTENUATEROW_SSE2
4367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBUNATTENUATEROW_AVX2
4369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table duplicating alpha.
// vpshufb control bytes operating on 16-bit words: within each 8-byte half,
// word 0 (bytes 0-1 / 8-9) is replicated into the first three words and the
// half's last word (bytes 6-7 / 14-15) stays in place.
43707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianstatic const uvec8 kUnattenShuffleAlpha_AVX2 = {
43717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
4372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
4373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// USE_GATHER is not on by default, due to being a slow instruction.
4375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef USE_GATHER
// Unattenuate 8 pixels per iteration with AVX2, fetching the per-alpha
// multipliers from fixed_invtbl8 (declared outside this block) with a single
// vpgatherdd.  This variant is only compiled when USE_GATHER is defined.
// edx is turned into (dst_argb - src_argb) so a single advancing pointer (eax)
// addresses both the source and, via [eax + edx], the destination.
// There is no remainder loop: every pass reads and writes a full 32 bytes, and
// the loop repeats while the count is still positive after subtracting 8.
43767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                             int width) {
4379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_argb
4381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
4382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
4383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax         // edx = dst_argb - src_argb
43847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
4385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm6, [eax]       // read 8 pixels.
    // The gather mask is rebuilt on every pass because vpgatherdd clears it.
4388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
4389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
4390ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
4393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
4396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
4397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
4398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
4399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [eax + edx], ymm0 // store to dst_argb
4401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
44027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
4406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#else  // USE_GATHER
// Unattenuate 8 ARGB pixels (32 bytes) per loop pass without VPGATHERDD.
// The alpha byte of each pixel indexes the dword table fixed_invtbl8 through
// scalar loads; the 8 looked-up dwords are packed into ymm3 and the widened
// pixel bytes are then multiplied by them with vpmulhuw.
// width is only tested after a full pass (sub 8 / jg), so it is presumably a
// positive multiple of 8 -- TODO confirm with callers.
44107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                             int width) {
4413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_argb
4416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
4417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
    // edx = dst - src, so [eax + edx] addresses dst while only eax advances.
4418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax
    // Broadcast the 128 bit shuffle mask into both lanes of ymm5.
44197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
4420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // esi/edi hold alpha bytes in the loop and are restored before ret.
    // All stack arguments were read above, so the [esp + N] offsets did not
    // need adjusting for these pushes.
4421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
4422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
4423ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // replace VPGATHER: 8 scalar table lookups, one per pixel alpha.
4426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      esi, byte ptr [eax + 3]                 // alpha0
4427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edi, byte ptr [eax + 7]                 // alpha1
4428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
4429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
4430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      esi, byte ptr [eax + 11]                // alpha2
4431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edi, byte ptr [eax + 15]                // alpha3
4432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
4433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
4434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
4435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      esi, byte ptr [eax + 19]                // alpha4
4436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edi, byte ptr [eax + 23]                // alpha5
4437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
4438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
4439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
4440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      esi, byte ptr [eax + 27]                // alpha6
4441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edi, byte ptr [eax + 31]                // alpha7
4442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
4443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
4444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
4445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
4446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
4447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
4448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // end of VPGATHER
4450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // "mutated" below: the AVX2 unpacks work per 128 bit lane, so the "low"
    // registers hold pixels 0,1,4,5 and the "high" ones pixels 2,3,6,7.
    // vpackuswb also works per lane, which puts the pixels back in memory
    // order ("unmutated").
4451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm6, [eax]       // read 8 pixels.
4452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
4455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
4456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
4457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
4458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia (high 16 bits of the product)
4459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia (high 16 bits of the product)
4460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1  // unmutated. saturates to 8 bits.
4461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [eax + edx], ymm0  // store 8 pixels to dst.
4462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
44637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
4467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
4469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // USE_GATHER
4473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBATTENUATEROW_AVX2
4474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBGRAYROW_SSSE3
4476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// The gray value is the kARGBToYJ weighted sum of each pixel plus kAddYJ64,
// shifted right by 7. It is stored in the three color bytes of the output
// pixel; the alpha byte is copied from the source.
// width is only tested after a full pass (sub 8 / jg), so it is presumably a
// positive multiple of 8 -- TODO confirm with callers.
44777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
4481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_argb */
4482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* width */
4483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kARGBToYJ
4484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kAddYJ64
4485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
44877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // G (gray): first 4 pixels
44887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]  // next 4 pixels
4489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm4
4490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
4491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm1
4492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm5  // Add .5 for rounding.
4493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
4494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0   // 8 G bytes
44957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax]  // A: reload the same 8 pixels for alpha
44967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 16]
4497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
4498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm2, 24
4499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm3, 24
4500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm2, xmm3
4501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm2, xmm2   // 8 A bytes
4502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
4503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0   // 8 GG words
4504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm3, xmm2   // 8 GA words
4505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
4506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm3   // GGGA first 4
4507ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm1, xmm3   // GGGA next 4
45087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
45097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1
4510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
45117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBGRAYROW_SSSE3
4517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSEPIAROW_SSSE3
// Sepia weights in 7 bit fixed point; each output channel is a weighted sum
// of the input channels, shifted right by 7:
4519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    b = (r * 35 + g * 68 + b * 17) >> 7
4520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    g = (r * 45 + g * 88 + b * 22) >> 7
4521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    r = (r * 50 + g * 98 + b * 24) >> 7
4522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Constants for ARGB color to sepia tone.
// Each table holds the weights for one output channel, listed in B, G, R, A
// byte order (matching the formulas above; the A weight is 0) and repeated
// for 4 pixels.
4523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToSepiaB = {
4524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
4526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToSepiaG = {
4528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
4530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic const vec8 kARGBToSepiaR = {
4532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
4534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place: pixels are read from and written back to dst_argb.
// B, G and R are each a weighted sum (kARGBToSepiaB/G/R) shifted right by 7
// with no rounding term, then packed with unsigned saturation. Alpha is
// carried over unchanged.
// width is only tested after a full pass (sub 8 / jg), so it is presumably a
// positive multiple of 8 -- TODO confirm with callers.
45367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* dst_argb */
4540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8]   /* width */
4541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, kARGBToSepiaB
4542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, kARGBToSepiaG
4543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kARGBToSepiaR
4544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // The same 32 source bytes are reloaded for each of B, G, R and A; the
    // stores at the end of the pass happen only after all four loads.
45467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // B
45477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm6, [eax + 16]
4548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm2
4549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm6, xmm2
4550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm6
4551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
4552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0   // 8 B values
45537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [eax]  // G
45547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
4555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm5, xmm3
4556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm3
4557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm5, xmm1
4558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 7
4559ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm5, xmm5   // 8 G values
4560ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5   // 8 BG values
45617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [eax]  // R
45627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
4563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm5, xmm4
4564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
4565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm5, xmm1
4566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 7
4567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm5, xmm5   // 8 R values
45687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm6, [eax]  // A
45697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
4570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm6, 24
4571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm1, 24
4572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm6, xmm1
4573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm6, xmm6   // 8 A values
4574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm5, xmm6   // 8 RA values
4575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0   // Weave BG, RA together
4576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm5   // BGRA first 4
4577ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm1, xmm5   // BGRA next 4
45787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [eax], xmm0
45797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [eax + 16], xmm1
4580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
45817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBSEPIAROW_SSSE3
4587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4589ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Transform 8 ARGB pixels (32 bytes) with color matrix.
4590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Same as Sepia except matrix is provided.
// matrix_argb supplies 16 signed bytes: 4 weights for each of the B, G, R and
// A outputs, in that order. For every output channel the weighted sums are
// added with signed saturation (phaddsw), shifted right arithmetically by 6
// and packed with unsigned saturation.
// width is only tested after a full pass (sub 8 / jg), so it is presumably a
// positive multiple of 8 -- TODO confirm with callers.
4591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
45937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                              const int8* matrix_argb, int width) {
4596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
4598ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_argb */
4599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  /* matrix_argb */
    // Broadcast each dword (one row of 4 weights) of the matrix across a
    // whole register: xmm2 = B row, xmm3 = G row, xmm4 = R row, xmm5 = A row.
4600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm5, [ecx]
4601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm2, xmm5, 0x00
4602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm3, xmm5, 0x55
4603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm4, xmm5, 0xaa
4604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm5, xmm5, 0xff
    // ecx is reused: the matrix pointer is no longer needed from here on.
4605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]  /* width */
4606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
46087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // B
46097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm7, [eax + 16]
4610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm2
4611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm7, xmm2
46127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm6, [eax]  // G
46137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
4614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm6, xmm3
4615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm3
4616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddsw    xmm0, xmm7   // B
4617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddsw    xmm6, xmm1   // G
4618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm0, 6      // B
4619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm6, 6      // G
4620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0   // 8 B values
4621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm6, xmm6   // 8 G values
4622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm6   // 8 BG values
46237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax]  // R
46247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm7, [eax + 16]
4625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm4
4626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm7, xmm4
4627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddsw    xmm1, xmm7   // R
46287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm6, [eax]  // A
46297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm7, [eax + 16]
4630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm6, xmm5
4631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm7, xmm5
4632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddsw    xmm6, xmm7   // A
4633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm1, 6      // R
4634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psraw      xmm6, 6      // A
4635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm1, xmm1   // 8 R values
4636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm6, xmm6   // 8 A values
4637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm6   // 8 RA values
4638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, xmm0   // Weave BG, RA together
4639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm1   // BGRA first 4
4640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm6, xmm1   // BGRA next 4
46417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
46427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm6
4643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
4644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
46457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBQUANTIZEROW_SSE2
4653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Quantize 4 ARGB pixels (16 bytes).
// Works in place on dst_argb. Each color byte v becomes
//   ((v * scale) >> 16) * interval_size + interval_offset
// packed with unsigned saturation; the alpha byte is kept from the source.
// width is only tested after a full pass (sub 4 / jg), so it is presumably a
// positive multiple of 4 -- TODO confirm with callers.
46547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          int interval_offset, int width) {
4657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]    /* dst_argb */
4659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esp + 8]   /* scale */
4660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [esp + 12]  /* interval_size */
4661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm4, [esp + 16]  /* interval_offset */
4662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 20]   /* width */
    // pshuflw 040h copies word 0 of the argument into the B, G and R lanes
    // and word 1 (its upper 16 bits) into the A lane; pshufd 044h repeats
    // that for the second pixel. For arguments that fit in 16 bits the A
    // lane is 0, so the computed alpha is 0 and the por below restores the
    // source alpha. NOTE(review): this appears to rely on scale,
    // interval_size and interval_offset being in 0..65535 -- confirm.
4663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm2, xmm2, 040h
4664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm2, xmm2, 044h
4665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm3, xmm3, 040h
4666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm3, xmm3, 044h
4667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm4, xmm4, 040h
4668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm4, xmm4, 044h
4669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5  // constant 0
4670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
4671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm6, 24
4672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4673ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
46747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // read 4 pixels
4675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5   // first 2 pixels
4676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
46777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax]  // read 4 pixels
4678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm5   // next 2 pixels
4679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm2
4680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm0, xmm3   // * interval_size
46817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm7, [eax]  // read 4 pixels
4682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmullw     xmm1, xmm3
4683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm7, xmm6   // mask alpha
4684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm4   // + interval_offset
4685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm1, xmm4
4686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
4687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm7   // merge the source alpha back in
46887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [eax], xmm0
4689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
46907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
4692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBQUANTIZEROW_SSE2
4696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSHADEROW_SSE2
// ARGBShadeRow_SSE2: scale every byte of each source pixel by the matching
// byte of 'value' and store the result to dst_argb.
//   src_argb - source pixels, 4 bytes per pixel, read with unaligned loads.
//   dst_argb - destination pixels, written with unaligned stores.
//   width    - pixel count; decremented by 4 per loop iteration.
//   value    - 4 scale bytes; byte i of value scales byte i of each pixel.
// Per byte the result is ((p * 0x0101) * (v * 0x0101)) >> 24, where p is the
// pixel byte and v the value byte.
// The function is naked: there is no compiler generated prologue/epilogue, so
// arguments are read directly from the stack relative to esp.
// The loop body runs before width is tested and always moves 16 bytes, so at
// least 4 pixels are read and written on every call.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 4 - confirm against the call sites.
4698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shade 4 pixels at a time by specified value.
46997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       uint32 value) {
4702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_argb
4704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_argb
4705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // width
4706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esp + 16]  // value
4707ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm2       // each value byte v becomes the word v * 0x0101
4708ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklqdq xmm2, xmm2       // repeat the 4 words so xmm2 covers 2 pixels
4709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
47117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]      // read 4 pixels
4712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
    // Duplicating each pixel byte p into both halves of a word gives p * 0x0101.
4714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0       // first 2
4715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm1       // next 2
    // pmulhuw keeps the high 16 bits of each unsigned 16x16 product.
4716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm2       // argb * value
4717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm2       // argb * value
    // A further shift by 8 leaves the top byte of the product in each word.
4718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
4719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
4720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1       // words back to bytes, 4 pixels in xmm0
47217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
47237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop      // continue while pixels remain
4725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBSHADEROW_SSE2
4730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBMULTIPLYROW_SSE2
// ARGBMultiplyRow_SSE2: multiply two rows byte by byte and store the result.
//   src_argb0 - first source row, read with unaligned loads.
//   src_argb1 - second source row, read with unaligned loads.
//   dst_argb  - destination row, written with unaligned stores.
//   width     - pixel count; decremented by 4 per loop iteration.
// Per byte the result is ((a * 0x0101) * b) >> 16, where a comes from
// src_argb0 and b from src_argb1.
// Naked function: arguments are read from the stack relative to esp, and esi
// is saved/restored by hand because it is used as a pointer.
// The loop body runs before width is tested and always moves 16 bytes.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 4 - confirm against the call sites.
4732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
47337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
4736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra "+ 4" in the offsets below accounts for the pushed esi.
4738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5  // constant 0
4743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
4747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, xmm0         // register copy of the src_argb0 pixels
4748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, xmm2         // register copy of the src_argb1 pixels
    // src_argb0 bytes are duplicated into words (a * 0x0101); src_argb1 bytes
    // are zero extended into words by interleaving with the zero in xmm5.
4749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0         // first 2
4750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm1         // next 2
4751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm5         // first 2
4752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm3, xmm5         // next 2
    // pmulhuw keeps the high 16 bits of each unsigned 16x16 product.
4753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
4754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
4755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
4757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1         // words back to bytes, 4 pixels in xmm0
4758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
47607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop        // continue while pixels remain
4762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBMULTIPLYROW_SSE2
4768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBADDROW_SSE2
// ARGBAddRow_SSE2: add two rows byte by byte with unsigned saturation
// (paddusb) and store the result.
//   src_argb0 - first source row.
//   src_argb1 - second source row.
//   dst_argb  - destination row.
//   width     - pixel count.
// Unlike the single-loop row functions next to it, this one has a 4 pixel
// loop followed by a 1 pixel loop for the remainder, and both loops are
// guarded before their first iteration. No memory is touched when width <= 0
// and no more than width pixels are read or written.
// Naked function: arguments are read from the stack relative to esp, and esi
// is saved/restored by hand.
4770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Add 2 rows of ARGB pixels together, 4 pixels at a time.
4771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Port this to posix, neon and other math functions.
47727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
4775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra "+ 4" in the offsets below accounts for the pushed esi.
4777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // ecx is biased by -4 so the sign of the counter says whether a full
    // group of 4 pixels is still available.
4782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
4783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         convertloop49      // fewer than 4 pixels: skip the 4 pixel loop
4784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop4:
4786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
4790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4792ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
47937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        convertloop4       // another full group of 4 remains
4795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop49:
    // ecx is (remaining - 4) here; adding 3 turns it into (remaining - 1).
4797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 4 - 1
4798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         convertloop19      // nothing left: skip the 1 pixel loop
4799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop1:
4801ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, [eax]        // read 1 pixels from src_argb0
4802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
4803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [esi]        // read 1 pixels from src_argb1
4804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 4]
4805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
4806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx], xmm0
4807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 4]
48087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 1
4809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        convertloop1       // another single pixel remains
4810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop19:
4812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBADDROW_SSE2
4817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSUBTRACTROW_SSE2
// ARGBSubtractRow_SSE2: subtract src_argb1 from src_argb0 byte by byte with
// unsigned saturation (psubusb) and store the result.
//   src_argb0 - minuend row, read with unaligned loads.
//   src_argb1 - subtrahend row, read with unaligned loads.
//   dst_argb  - destination row, written with unaligned stores.
//   width     - pixel count; decremented by 4 per loop iteration.
// Naked function: arguments are read from the stack relative to esp, and esi
// is saved/restored by hand.
// The loop body runs before width is tested and always moves 16 bytes.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 4 - confirm against the call sites.
4819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
48207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
4823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra "+ 4" in the offsets below accounts for the pushed esi.
4825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
4832ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
4833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
4834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
4835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
4836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
4837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
48387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
4839ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop        // continue while pixels remain
4840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBSUBTRACTROW_SSE2
4846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBMULTIPLYROW_AVX2
// ARGBMultiplyRow_AVX2: multiply two rows byte by byte and store the result,
// using 256 bit registers.
//   src_argb0 - first source row, read with unaligned loads.
//   src_argb1 - second source row, read with unaligned loads.
//   dst_argb  - destination row, written with unaligned stores.
//   width     - pixel count; decremented by 8 per loop iteration.
// Per byte the result is ((a * 0x0101) * b) >> 16, where a comes from
// src_argb0 and b from src_argb1.
// Naked function: arguments are read from the stack relative to esp, and esi
// is saved/restored by hand.
// The loop body runs before width is tested and always moves 32 bytes.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 - confirm against the call sites.
4848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
48497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
4852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra "+ 4" in the offsets below accounts for the pushed esi.
4854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4858ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpxor      ymm5, ymm5, ymm5     // constant 0
4859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
4862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
4863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
4864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 32]
    // vpunpck*bw and vpackuswb operate on each 128 bit lane independently, so
    // "low 4" / "high 4" below are the low / high 2 pixels of each lane. The
    // final pack undoes the split, leaving the pixels in their source order.
    // src_argb0 bytes are duplicated into words (a * 0x0101); src_argb1 bytes
    // are zero extended into words by interleaving with the zero in ymm5.
4865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm1, ymm1   // low 4
4866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhbw ymm1, ymm1, ymm1   // high 4
4867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpcklbw ymm2, ymm3, ymm5   // low 4
4868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpunpckhbw ymm3, ymm3, ymm5   // high 4
    // vpmulhuw keeps the high 16 bits of each unsigned 16x16 product.
4869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
4870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
4871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1   // words back to bytes, 8 pixels in ymm0
4872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
4873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
4874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8
4875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop        // continue while pixels remain
4876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Clear the upper halves of the ymm registers before returning to code
    // that may use legacy SSE instructions.
4878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
4879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBMULTIPLYROW_AVX2
4883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBADDROW_AVX2
// ARGBAddRow_AVX2: add two rows byte by byte with unsigned saturation
// (vpaddusb) and store the result, using 256 bit registers.
//   src_argb0 - first source row, read with an unaligned load.
//   src_argb1 - second source row, used directly as a memory operand.
//   dst_argb  - destination row, written with unaligned stores.
//   width     - pixel count; decremented by 8 per loop iteration.
// Naked function: arguments are read from the stack relative to esp, and esi
// is saved/restored by hand.
// The loop body runs before width is tested and always moves 32 bytes; this
// function has no remainder loop.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 - confirm against the call sites.
4885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Add 2 rows of ARGB pixels together, 8 pixels at a time.
48867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
4889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra "+ 4" in the offsets below accounts for the pushed esi.
4891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
4898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
4899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
4900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 32]
4901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
4902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
4903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8
4904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop              // continue while pixels remain
4905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Clear the upper halves of the ymm registers before returning to code
    // that may use legacy SSE instructions.
4907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
4908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBADDROW_AVX2
4912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSUBTRACTROW_AVX2
// ARGBSubtractRow_AVX2: subtract src_argb1 from src_argb0 byte by byte with
// unsigned saturation (vpsubusb) and store the result, using 256 bit
// registers.
//   src_argb0 - minuend row, read with an unaligned load.
//   src_argb1 - subtrahend row, used directly as a memory operand.
//   dst_argb  - destination row, written with unaligned stores.
//   width     - pixel count; decremented by 8 per loop iteration.
// Naked function: arguments are read from the stack relative to esp, and esi
// is saved/restored by hand.
// The loop body runs before width is tested and always moves 32 bytes.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 - confirm against the call sites.
4914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
49157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_argb, int width) {
4918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra "+ 4" in the offsets below accounts for the pushed esi.
4920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_argb0
4921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_argb1
4922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
4923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
4924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
4926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
4927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
4928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
4929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 32]
4930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
4931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
4932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 8
4933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop              // continue while pixels remain
4934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
    // Clear the upper halves of the ymm registers before returning to code
    // that may use legacy SSE instructions.
4936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
4937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBSUBTRACTROW_AVX2
4941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELXROW_SSE2
// SobelXRow_SSE2: compute the absolute horizontal Sobel response for one
// output row from three input rows, 8 output bytes per loop iteration.
//   src_y0, src_y1, src_y2 - the three input rows.
//   dst_sobelx             - output row, one byte per input column.
//   width                  - column count; decremented by 8 per iteration.
// For column i the output is
//   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|
// saturated to 255 by packuswb.
// Each iteration reads bytes i .. i+9 of every input row (an 8 byte load at
// offset 0 and another at offset 2), i.e. 2 bytes past the 8 columns written.
// Naked function: arguments are read from the stack relative to esp; esi and
// edi are saved/restored by hand.
// The loop body runs before width is tested and always writes 8 bytes.
// NOTE(review): presumably callers pass a positive width that is a multiple
// of 8 and rows with 2 readable bytes of padding - confirm at the call sites.
4943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelX as a matrix is
4944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1  0  1
4945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -2  0  2
4946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1  0  1
// The code subtracts column 2 from column 0, which is the negation of the
// matrix above; the absolute value taken at the end makes the sign irrelevant.
49477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                    const uint8* src_y2, uint8* dst_sobelx, int width) {
4950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
4951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
4952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The extra "+ 8" in the offsets below accounts for the two pushes.
4953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   // src_y0
4954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_y1
4955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 12]  // src_y2
4956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]  // dst_sobelx
4957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]  // width
    // Turn the other pointers into offsets from src_y0 so that advancing eax
    // alone steps through all three inputs and the output.
4958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        esi, eax
4959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, eax
4960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax
4961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5  // constant 0
4962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Row 0: xmm0 = y0[i] - y0[i + 2] as 16 bit words.
4964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
4965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
4966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5
4967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm5
4968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm0, xmm1
    // Row 1: xmm1 = y1[i] - y1[i + 2].
4969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
4970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
4971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm5
4972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm5
4973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm1, xmm2
    // Row 2: xmm2 = y2[i] - y2[i + 2].
4974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
4975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
4976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm5
4977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm3, xmm5
4978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm2, xmm3
    // Sum = row0 + row2 + 2 * row1 (the middle row is added twice).
4979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm2
4980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm1
4981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm1
4982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
4983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm1, xmm0
4984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaxsw     xmm0, xmm1
4985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0   // words to bytes, values above 255 saturate
4986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [eax + edx], xmm0      // eax + edx addresses dst_sobelx
4987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 8]
49887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
4989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop  // continue while columns remain
4990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
4992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
4993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
4994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
4995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
4996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SOBELXROW_SSE2
4997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELYROW_SSE2
4999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// SobelY as a matrix is
5000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// -1 -2 -1
5001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//  0  0  0
5002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//  1  2  1
50037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                    uint8* dst_sobely, int width) {
  // For every output pixel i this computes
  //   abs((y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) + (y0[i + 2] - y1[i + 2]))
  // clamps it to 255 and stores it in dst_sobely[i].
  // 8 pixels are produced per iteration. The loop body always runs at least
  // once and repeats while the remaining width is positive, so width is
  // effectively rounded up to a multiple of 8. Each iteration reads bytes
  // 0..9 of both source rows.
5006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // Arguments are read straight from the stack; the first "+ 4" in each
    // address skips the esi value pushed above.
5008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_y0
5009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_y1
5010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_sobely
5011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
    // Turn src_y1 and dst_sobely into offsets from src_y0 so that only eax
    // has to be advanced inside the loop.
5012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        esi, eax             // esi = src_y1 - src_y0
5013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, eax             // edx = dst_sobely - src_y0
5014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5  // constant 0
5015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
    // Column 0: widen bytes to 16 bit words and take y0 - y1.
5017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5
5020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm5
5021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm0, xmm1
    // Column 1 (gets weight 2 below).
5022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
5023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm5
5025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm5
5026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm1, xmm2
    // Column 2.
5027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm5
5030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm3, xmm5
5031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm2, xmm3
    // sum = column0 + column2 + 2 * column1 (column 1 is added twice).
5032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm2
5033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm1
5034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm1
5035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm1, xmm0
5037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaxsw     xmm0, xmm1
5038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0   // saturate the 8 words to unsigned bytes
5039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [eax + edx], xmm0  // store 8 pixels to dst_sobely
5040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 8]
50417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
5042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
5043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SOBELYROW_SSE2
5049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELROW_SSE2
5051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255
5053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel
5054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel
5055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel
// Sobel is the saturating byte sum of src_sobelx[i] and src_sobely[i].
// 16 pixels are converted per iteration (16 bytes read from each source row,
// 64 bytes written to dst_argb). The loop body always runs at least once and
// repeats while the remaining width is positive.
50567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                   uint8* dst_argb, int width) {
5059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The first "+ 4" in each address skips the esi value pushed above.
5061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_sobelx
5062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_sobely
5063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
5064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
5065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        esi, eax             // esi = src_sobely - src_sobelx
5066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // alpha 255
5067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm5, 24             // 0xff000000
5068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
50707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
50717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
    // Interleave the register with itself twice so every sobel byte is
    // replicated into all 4 bytes of a 32 bit pixel.
5074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0             // GG
5075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm0             // First 8
5076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm0, xmm0             // Next 8
5077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm2             // GGGG
5078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm1, xmm2             // First 4
5079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm2, xmm2             // Next 4
    // OR with 0xff000000 forces the top byte of each pixel (alpha) to 255.
5080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm1, xmm5             // GGGA
5081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm2, xmm5
5082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm0             // GGGG
5083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm3, xmm0             // Next 4
5084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm0, xmm0             // Last 4
5085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm3, xmm5             // GGGA
5086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    por        xmm0, xmm5
    // Store 16 ARGB pixels (4 per register) in source order.
50877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm1
50887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm2
50897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 32], xmm3
50907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 48], xmm0
5091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 64]
50927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
5094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SOBELROW_SSE2
5100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELTOPLANEROW_SSE2
5102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Adds Sobel X and Sobel Y and stores Sobel into a plane.
// Each output byte is the saturating sum of the matching src_sobelx and
// src_sobely bytes. 16 pixels are handled per iteration; the loop body always
// runs at least once and repeats while the remaining width is positive.
51037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_y, int width) {
5106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The first "+ 4" in each address skips the esi value pushed above.
5108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_sobelx
5109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_sobely
5110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_y
5111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
5112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        esi, eax             // esi = src_sobely - src_sobelx
5113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
51157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
51167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
51197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0            // store 16 pixels to dst_y
5120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
51217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
5123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SOBELTOPLANEROW_SSE2
5129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_SOBELXYROW_SSE2
5131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Mixes Sobel X, Sobel Y and Sobel into ARGB.
5132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// A = 255
5133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// R = Sobel X
5134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// G = Sobel
5135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// B = Sobel Y
// Sobel is the saturating byte sum of src_sobelx[i] and src_sobely[i].
// 16 pixels are converted per iteration (16 bytes read from each source row,
// 64 bytes written to dst_argb). The loop body always runs at least once and
// repeats while the remaining width is positive.
51367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                     uint8* dst_argb, int width) {
5139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The first "+ 4" in each address skips the esi value pushed above.
5141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   // src_sobelx
5142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   // src_sobely
5143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]  // dst_argb
5144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  // width
5145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        esi, eax             // esi = src_sobely - src_sobelx
5146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5           // alpha 255
5147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
51497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
51507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
5151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0
5153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
    // Build byte pairs: (sobelx, 255) in xmm3/xmm0 and (sobely, sobel) in
    // xmm4/xmm1, low 8 pixels first, high 8 pixels second.
5154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm0             // XA
5155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm3, xmm5
5156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm0, xmm5
5157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm1             // YS
5158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm4, xmm2
5159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm2
    // Interleave the pairs as words so each pixel's bytes, in ascending
    // address order, are sobely, sobel, sobelx, 255.
5160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, xmm4             // YSXA
5161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm6, xmm3             // First 4
5162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm4, xmm3             // Next 4
5163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm7, xmm1             // YSXA
5164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm7, xmm0             // Next 4
5165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm1, xmm0             // Last 4
    // Store 16 ARGB pixels (4 per register) in source order.
51667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm6
51677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm4
51687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 32], xmm7
51697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 48], xmm1
5170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 64]
51717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
5173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_SOBELXYROW_SSE2
5179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Consider float CumulativeSum.
5182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Consider calling CumulativeSum one row at a time as needed.
5183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Convert cumulative sum for an area to an average for 1 pixel.
5185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// topleft is pointer to top left of CumulativeSum buffer for area.
5186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// botleft is pointer to bottom left of CumulativeSum buffer.
5187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// width is offset from left to right of area in CumulativeSum buffer measured
5188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//   in number of ints.
5189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// area is the number of pixels in the area being averaged.
5190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// dst points to pixel to store result to.
5191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// count is number of averaged pixels to produce.
51927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Does 4 pixels at a time.
//
// Each pixel occupies 4 consecutive int32 sums (16 bytes) in the cumulative
// sum buffers and produces 4 output bytes. Per int32 element the area sum is
//   topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// which is then scaled by an approximate reciprocal of area (rcpss).
// Three loops are used: a 16 bit fixed point loop for area <= 128, a float
// loop for larger areas (both 4 pixels per iteration), and a 1 pixel float
// loop for the remaining 0..3 pixels.
// NOTE(review): psubd/paddd read their memory operands directly, which for
// non-VEX SSE2 encodings requires 16 byte aligned addresses; presumably the
// buffers and width keep every access aligned - confirm against callers.
5193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                    int width, int area, uint8* dst,
5195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                    int count) {
5196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, topleft  // eax topleft
5198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, botleft  // esi botleft
5199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, width
5200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm5, area
5201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, dst
5202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, count
5203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm5, xmm5  // area as float (low lane)
5204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    rcpss      xmm4, xmm5  // 1.0f / area
5205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm4, xmm4, 0  // broadcast 1 / area to all 4 lanes
    // ecx is biased by -4 while the 4 pixel loops run. Fewer than 4 pixels:
    // skip directly to the 1 pixel loop.
5206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
5207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         l4b
5208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // area above 128 (unsigned compare) takes the float loop at l4.
5209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        area, 128  // 128 pixels will not overflow 15 bits.
5210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ja         l4
5211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Build the 16 bit fixed point reciprocal used by the small area loop.
5212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm5, xmm5, 0        // area
5213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
5214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm6, 16
5215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm6, xmm6
5216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm5, xmm6           // (65536.0 + area - 1)
5217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
5218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
    // NOTE(review): packssdw saturates at 32767, so the reciprocal would be
    // clamped if it exceeded 15 bits (very small areas) - confirm the
    // minimum area callers pass.
5219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm5, xmm5           // 16 bit shorts
5220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop small blocks.
5222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  s4:
5223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // top left
52247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
52257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
52267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
52277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
5228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // - top right
5230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm0, [eax + edx * 4]
5231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm1, [eax + edx * 4 + 16]
5232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm2, [eax + edx * 4 + 32]
5233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm3, [eax + edx * 4 + 48]
5234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
5235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // - bottom left
5237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm0, [esi]
5238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm1, [esi + 16]
5239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm2, [esi + 32]
5240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm3, [esi + 48]
5241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // + bottom right
5243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, [esi + edx * 4]
5244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm1, [esi + edx * 4 + 16]
5245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, [esi + edx * 4 + 32]
5246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, [esi + edx * 4 + 48]
5247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 64]
5248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm2, xmm3
5251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // average = (sum * fixed point reciprocal) >> 16
5252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm0, xmm5
5253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm2, xmm5
5254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Saturate to bytes and store 4 pixels (16 bytes).
5255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
5256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi], xmm0
5257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 16]
5258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
5259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        s4
5260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        l4b
5262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop
5264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l4:
5265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // top left
52667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
52677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
52687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + 32]
52697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + 48]
5270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // - top right
5272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm0, [eax + edx * 4]
5273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm1, [eax + edx * 4 + 16]
5274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm2, [eax + edx * 4 + 32]
5275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm3, [eax + edx * 4 + 48]
5276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
5277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // - bottom left
5279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm0, [esi]
5280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm1, [esi + 16]
5281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm2, [esi + 32]
5282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm3, [esi + 48]
5283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // + bottom right
5285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, [esi + edx * 4]
5286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm1, [esi + edx * 4 + 16]
5287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, [esi + edx * 4 + 32]
5288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, [esi + edx * 4 + 48]
5289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 64]
5290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
5292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm1, xmm1
5293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm0, xmm4
5294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm1, xmm4
5295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm2, xmm2
5296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm3, xmm3
5297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm2, xmm4
5298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm3, xmm4
    // Back to int32, then narrow with saturation: 32 -> 16 -> 8 bits.
5299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtps2dq   xmm0, xmm0
5300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtps2dq   xmm1, xmm1
5301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtps2dq   xmm2, xmm2
5302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtps2dq   xmm3, xmm3
5303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm0, xmm1
5304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm2, xmm3
5305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm2
5306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi], xmm0
5307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 16]
5308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
5309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        l4
5310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l4b:
    // Undo the -4 bias and pre-subtract 1: ecx = remaining pixels - 1.
    // Negative means nothing is left.
5312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 4 - 1
5313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         l1b
5314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel loop
5316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l1:
    // Same area sum and float scaling as the l4 loop, for a single pixel.
53177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
5318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm0, [eax + edx * 4]
5319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubd      xmm0, [esi]
5321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, [esi + edx * 4]
5322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
5323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm0, xmm0
5324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm0, xmm4
5325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtps2dq   xmm0, xmm0
5326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm0, xmm0
5327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
5328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       dword ptr [edi], xmm0  // store 1 pixel (4 bytes)
5329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 4]
5330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 1
5331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        l1
5332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l1b:
5333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Creates a table of cumulative sums where each value is a sum of all values
5339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// above and to the left of the value.
//
// row:             source pixels, 4 bytes each; every byte is widened to its
//                  own 32 bit lane before being summed.
// cumsum:          output; 4 x int32 (16 bytes) are stored per pixel.
// previous_cumsum: 4 x int32 per pixel that are added to the running sum of
//                  this row (the "above" part of the table).
// width:           number of pixels to process.
//
// xmm0 holds the running per-channel sum of the current row and xmm1 stays
// zero so the unpack instructions can zero-extend bytes and words.
// The 4 pixel loop is only entered when width >= 4 and cumsum is 16 byte
// aligned; every other case is handled by the 1 pixel loop.
5340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                  const int32* previous_cumsum, int width) {
5342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, row  // eax = source pixel pointer
5344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, cumsum  // edx = output pointer
5345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, previous_cumsum  // esi = sums of the row above
5346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, width  // ecx = pixels remaining
5347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm0, xmm0  // running row sum, 4 x int32
5348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm1, xmm1  // constant zero for unpacking
5349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
5351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         l4b  // fewer than 4 pixels: skip the 4 pixel loop
5352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    test       edx, 15
5353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jne        l4b  // cumsum not 16 byte aligned: use the 1 pixel loop
5354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop
5356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l4:
5357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
5358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm2  // keep a copy for pixels 2 and 3
5360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm1  // pixels 0 and 1 as 8 words
5362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2
5363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm2, xmm1  // pixel 0 as 4 dwords
5364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm3, xmm1  // pixel 1 as 4 dwords
5365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm4, xmm1  // pixels 2 and 3 as 8 words
5367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, xmm4
5368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm4, xmm1  // pixel 2 as 4 dwords
5369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm5, xmm1  // pixel 3 as 4 dwords
5370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm2  // running sum += pixel 0
53727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [esi]  // previous row above.
5373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm0  // result 0 = above + running sum
5374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm3  // running sum += pixel 1
53767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [esi + 16]
5377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, xmm0  // result 1
5378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm4  // running sum += pixel 2
53807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [esi + 32]
5381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm4, xmm0  // result 2
5382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm5  // running sum += pixel 3
53847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [esi + 48]
5385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 64]
5386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm5, xmm0  // result 3
5387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
53887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm2  // store 4 results, 64 bytes in total
53897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm3
53907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 32], xmm4
53917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 48], xmm5
5392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 64]
5394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
5395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        l4
5396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l4b:
5398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 4 - 1  // undo the bias of 4; ecx = pixels left - 1
5399ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         l1b  // nothing left
5400ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5401ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel loop
5402ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l1:
5403ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
5404ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
5405ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm1  // bytes -> words
5406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm2, xmm1  // words -> dwords
5407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm2  // running sum += pixel
5408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [esi]  // entry of the row above
5409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
5410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm0  // result = above + running sum
5411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm2
5412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
5413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 1
5414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        l1
5415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian l1b:
5417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5419ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5420ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5421ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBAFFINEROW_SSE2
5422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Copy ARGB pixels from source image with slope to a row of destination.
//
// uv_dudv points to 4 floats: the starting source coordinate (u, v) followed
// by the per-pixel step (du, dv). For each of the `width` destination pixels
// the current coordinate is truncated to integers (cvttps2dq), turned into
// the byte offset x * 4 + y * src_argb_stride, and the 4 byte pixel at
// src_argb + offset is copied to dst_argb.
//
// Coordinates are saturated to signed 16 bit by packssdw, and the offset is
// formed with pmaddwd on signed 16 bit words, so only the low 16 bits of
// src_argb_stride take part in the multiply (the stride is shifted into the
// high word of a 32 bit constant next to the pixel size 4).
//
// This is a naked function: arguments are read relative to esp after the two
// pushes, and esi/edi are restored before ret.
54237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh VenkatasubramanianLIBYUV_API
5425ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_argb, const float* uv_dudv, int width) {
5427ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
5429ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
5430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 12]  // src_argb
5431ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 16]  // stride
5432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 20]  // dst_argb
5433ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 24]  // pointer to uv_dudv
5434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [ecx]  // uv
5435ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm7, qword ptr [ecx + 8]  // dudv
5436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 28]  // width
5437ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shl        esi, 16          // 4, stride
5438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        esi, 4  // low word = 4 (bytes per pixel), high word = stride
5439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm5, esi  // pmaddwd multiplier for one (x, y) pair
5440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
5441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         l4b  // fewer than 4 pixels: only the low lanes are needed
5442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // setup for 4 pixel loop
5444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm7, xmm7, 0x44  // dup dudv
5445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm5, xmm5, 0  // dup 4, stride
5446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm2    // x0, y0, x1, y1
5447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm0, xmm7  // low half = uv + dudv (pixel 1)
5448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movlhps    xmm2, xmm0  // xmm2 = pixel 0 and pixel 1 coordinates
5449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm7
5450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm4, xmm4    // dudv *= 2
5451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm2    // x2, y2, x3, y3
5452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm3, xmm4  // pixels 0, 1 advanced by 2 steps
5453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm4, xmm4    // dudv *= 4
5454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop
5456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  l4:
5457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
5458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
5459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packssdw   xmm0, xmm1    // x, y as 8 shorts
5460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
5461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       esi, xmm0  // offset of pixel 0
5462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm0, 0x39  // shift right
5463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       edi, xmm0  // offset of pixel 1
5464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm0, 0x39  // shift right
5465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [eax + esi]  // read pixel 0
5466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm6, [eax + edi]  // read pixel 1
5467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
5468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm2, xmm4    // x, y += dx, dy first 2
5469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm1
5470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       esi, xmm0  // offset of pixel 2
5471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm0, 0x39  // shift right
5472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       edi, xmm0  // offset of pixel 3
5473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm6, [eax + esi]  // read pixel 2
5474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, [eax + edi]  // read pixel 3
5475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
5476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm3, xmm4    // x, y += dx, dy next 2
5477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr 8[edx], xmm6
54787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 16]
54797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
54807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jge        l4
5481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
54827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  l4b:
54837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    add        ecx, 4 - 1  // undo the bias of 4; ecx = pixels left - 1
54847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jl         l1b  // nothing left
5485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
54867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    // 1 pixel loop
54877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  l1:
54887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    cvttps2dq  xmm0, xmm2    // x, y float to int
54897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    packssdw   xmm0, xmm0    // x, y as shorts
54907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
54917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    addps      xmm2, xmm7    // x, y += dx, dy
54927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movd       esi, xmm0
54937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movd       xmm0, [eax + esi]  // copy a pixel
54947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movd       [edx], xmm0
54957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 4]
54967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 1
54977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jge        l1
54987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  l1b:
5499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
5500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
55047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_ARGBAFFINEROW_SSE2
5505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
55067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_INTERPOLATEROW_AVX2
55077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Bilinear filter 32x2 -> 32x1
//
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr. source_y_fraction is halved to a 7 bit weight f:
//   f == 0  : the first row is copied unchanged with rep movsb.
//   f == 32 : two rounded averages (vpavgb) give roughly 75% row 0 / 25% row 1.
//   f == 64 : one rounded average, 50 / 50.
//   f == 96 : two rounded averages give roughly 25% row 0 / 75% row 1.
//   other   : (row0 * (128 - f) + row1 * f) >> 7 via vpmaddubsw.
//
// Every loop except the copy handles 32 bytes per pass and repeats while the
// remaining count is > 0, so a dst_width that is not a multiple of 32 still
// reads and writes one whole 32 byte vector on the final pass.
//
// This is a naked function: arguments are read relative to esp after the two
// pushes, and esi/edi are restored before ret.
55087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
55097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         ptrdiff_t src_stride, int dst_width,
5511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         int source_y_fraction) {
5512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
5514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
5515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 4]   // dst_ptr
5516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_ptr
5517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // src_stride
5518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 16]  // dst_width
5519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
55207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    shr        eax, 1  // reduce the fraction to 7 bits
5521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Dispatch to specialized filters if applicable.
5522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 0
55237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    je         xloop100  // 0 / 128.  Blend 100 / 0.
55247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        edi, esi  // edi = dst - src, so [esi + edi] addresses dst
55257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    cmp        eax, 32
55267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
5527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 64
55287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
55297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    cmp        eax, 96
55307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
5531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // General case. 0 was dispatched to xloop100 above, so the weight for the
    // second row is 1..127 here and the weight for the first row is 127..1;
    // both fit in the signed bytes that vpmaddubsw expects in ymm5.
55327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm0, eax  // high fraction 0..127
55337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    neg        eax
55347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    add        eax, 128
55357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovd      xmm5, eax  // low fraction 128..1
55367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw xmm5, xmm5, xmm0  // byte pair: (low fraction, high fraction)
55377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklwd xmm5, xmm5, xmm5  // pair repeated twice in dword 0
55387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpxor      ymm0, ymm0, ymm0  // all-zero permute indices
55397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermd     ymm5, ymm0, ymm5  // broadcast dword 0 to all 8 dwords
5540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5541ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
55427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm0, [esi]  // 32 bytes of row 0
55437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    ymm2, [esi + edx]  // 32 bytes of row 1
55447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhbw ymm1, ymm0, ymm2  // mutates
55457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw ymm0, ymm0, ymm2  // mutates
55467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmaddubsw ymm0, ymm0, ymm5  // row0 * low + row1 * high, as words
55477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmaddubsw ymm1, ymm1, ymm5
55487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm0, ymm0, 7  // divide by 128
55497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw     ymm1, ymm1, 7
55507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb  ymm0, ymm0, ymm1  // unmutates
55517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu    [esi + edi], ymm0  // store to dst
55527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        esi, [esi + 32]
55537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32
5554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop
5555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
55577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   // Blend 25 / 75.
55587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop25:
55597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    ymm0, [esi]  // row 0
55607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    ymm1, [esi + edx]  // row 1
55617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vpavgb     ymm0, ymm0, ymm1  // average of both rows
55627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vpavgb     ymm0, ymm0, ymm1  // average again with row 1
55637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    [esi + edi], ymm0
55647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   lea        esi, [esi + 32]
55657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   sub        ecx, 32
55667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   jg         xloop25
55677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   jmp        xloop99
55687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
55697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   // Blend 50 / 50.
55707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop50:
55717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    ymm0, [esi]
55727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vpavgb     ymm0, ymm0, [esi + edx]
55737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    [esi + edi], ymm0
55747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   lea        esi, [esi + 32]
55757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   sub        ecx, 32
55767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   jg         xloop50
55777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   jmp        xloop99
55787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
55797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   // Blend 75 / 25.
55807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop75:
55817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    ymm1, [esi]  // row 0
55827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    ymm0, [esi + edx]  // row 1
55837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vpavgb     ymm0, ymm0, ymm1  // average of both rows
55847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vpavgb     ymm0, ymm0, ymm1  // average again with row 0
55857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   vmovdqu    [esi + edi], ymm0
55867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   lea        esi, [esi + 32]
55877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   sub        ecx, 32
55887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   jg         xloop75
55897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   jmp        xloop99
55907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
55917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   // Blend 100 / 0 - Copy row unchanged.
   // Reached before "sub edi, esi", so edi is still the real dst pointer and
   // rep movsb copies ecx (dst_width) bytes from [esi] to [edi].
55927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop100:
55937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian   rep movsb
5594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop99:
5596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
5597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
55987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
5599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
56027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_INTERPOLATEROW_AVX2
5603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear filter 16x2 -> 16x1
56057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
56067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
56077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                          ptrdiff_t src_stride, int dst_width,
56087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                          int source_y_fraction) {
5609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
5611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
5612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 4]   // dst_ptr
5613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_ptr
5614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // src_stride
5615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 16]  // dst_width
5616ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
5617ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi
5618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shr        eax, 1
5619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Dispatch to specialized filters if applicable.
5620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 0
5621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop100  // 0 / 128.  Blend 100 / 0.
5622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 32
5623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
5624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 64
5625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
5626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 96
5627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
5628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, eax  // high fraction 0..127
5630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    neg        eax
5631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        eax, 128
5632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm5, eax  // low fraction 128..1
5633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm5, xmm0
5634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm5, xmm5
5635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm5, xmm5, 0
5636ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5637ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
5638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]
5639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [esi + edx]
5640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, xmm0
5641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm2
5642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm2
5643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm5
5644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm1, xmm5
5645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
5646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 7
5647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
5648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5649ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
56507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop
5652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 25 / 75.
5655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop25:
5656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]
5657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi + edx]
5658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5660ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
56627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop25
5664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 50 / 50.
5667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop50:
5668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]
5669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi + edx]
5670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5672ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
56737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop50
5675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 75 / 25.
5678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop75:
5679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi]
5680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi + edx]
5681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5683ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
56857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop75
5687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 100 / 0 - Copy row unchanged.
5690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop100:
5691ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]
5692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
56947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop100
5696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop99:
5698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
5699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_INTERPOLATEROW_SSE2
5705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear filter 16x2 -> 16x1
//
// Vertically blends two source rows into one destination row, 16 bytes per
// loop iteration. Row 0 is read from src_ptr and row 1 from
// src_ptr + src_stride. For a general fraction the result is approximately
//   row0 + (row1 - row0) * source_y_fraction / 256
// Fractions of 0, 64, 128 and 192 are dispatched to dedicated loops that copy
// row 0 or blend with pavgb instead of multiplying.
//
// Every loop runs its body before testing the remaining width, so one 16-byte
// block is always read and written, and a dst_width that is not a multiple of
// 16 is processed as whole 16-byte blocks (rounded up).
//
// The function is naked: arguments are fetched from the stack by hand, and
// esi / edi are saved on entry and restored at xloop99.
57067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
57077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
57087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         ptrdiff_t src_stride, int dst_width,
57097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                         int source_y_fraction) {
5710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
5712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The extra 8 in each offset below skips the two registers pushed above.
5713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 4]   // dst_ptr
5714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]   // src_ptr
5715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]  // src_stride
5716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 16]  // dst_width
5717ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // edi = dst_ptr - src_ptr, so [esi + edi] addresses the destination while
    // only esi has to be advanced inside the loops.
5718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edi, esi
5719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Dispatch to specialized filters if applicable.
5720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 0
5721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop100  // 0 / 256.  Blend 100 / 0.
5722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 64
5723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
5724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 128
5725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
5726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        eax, 192
5727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
5728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // General case: broadcast the fraction to all eight 16-bit lanes of xmm5.
    // The fraction byte is duplicated into both halves of a word (f * 257)
    // and then halved, giving a positive 15-bit multiplier for pmulhw.
    // This relies on the fraction being in 0..255 as noted above.
5729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm5, eax            // xmm5 = y fraction
5730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm5, xmm5  // low word = fraction in both bytes
5731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 1  // make it a 15-bit value so it is non-negative
5732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm5, xmm5  // replicate word -> dword
5733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm5, xmm5  // replicate dword -> qword
5734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklqdq xmm5, xmm5  // replicate qword -> all 128 bits
5735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm4, xmm4  // xmm4 = 0, used to widen bytes to words
5736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
5738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]  // row0
5739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm2, [esi + edx]  // row1
5740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, xmm0  // second copy of row0 for the high 8 bytes
5741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm3, xmm2  // second copy of row1 for the high 8 bytes
5742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm4  // row1 low 8 bytes -> words
5743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm3, xmm4  // row1 high 8 bytes -> words
5744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm4  // row0 low 8 bytes -> words
5745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm4  // row0 high 8 bytes -> words
5746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm2, xmm0  // row1 - row0
5747ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psubw      xmm3, xmm1
5748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
5749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm3, xmm3
    // pmulhw keeps the high 16 bits of the signed product; with the doubled
    // difference this is roughly diff * fraction / 256.
5750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhw     xmm2, xmm5  // scale diff
5751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhw     xmm3, xmm5
5752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm0, xmm2  // sum rows
5753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddw      xmm1, xmm3
5754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1  // narrow to 16 bytes with unsigned saturation
5755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0  // store to the destination row
5756ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
57577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16  // 16 bytes done
5758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop  // repeat while remaining width > 0
5759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 25 / 75.
    // avg(avg(row0, row1), row1): about 25% row0 and 75% row1.
    // pavgb rounds up on each step.
5762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop25:
5763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]  // row0
5764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi + edx]  // row1
5765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
57697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop25
5771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5772ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5773ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 50 / 50.
    // A single rounded byte average of the two rows.
5774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop50:
5775ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]  // row0
5776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi + edx]  // row1
5777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
57807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop50
5782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 75 / 25.
    // Same as xloop25 with the register roles swapped:
    // avg(avg(row1, row0), row0), about 75% row0 and 25% row1.
5785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop75:
5786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [esi]  // row0
5787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi + edx]  // row1
5788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
5790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
57927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop75
5794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        xloop99
5795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // Blend 100 / 0 - Copy row unchanged.
    // Row 1 is never read on this path. The loop falls through to xloop99.
5797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop100:
5798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [esi]
5799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [esi + edi], xmm0
5800ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 16]
58017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
5802ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop100
5803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Common exit: restore the saved registers in reverse push order.
5804ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop99:
5805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
5806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_INTERPOLATEROW_SSE2
5811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
//
// Reorders bytes of src_argb into dst_argb with pshufb. A 16-byte shuffle
// control mask is loaded once from shuffler and applied to every 16-byte
// group of the source.
//
// Each iteration handles 32 bytes and subtracts 8 from pix (4 bytes per
// pixel). The loop body runs before the count is tested, so at least 32 bytes
// are always read and written, and a pix that is not a multiple of 8 is
// processed as whole 32-byte blocks.
//
// The function is naked and uses only eax, ecx, edx and xmm registers, so
// nothing is pushed and the arguments sit at [esp + 4] onward.
58137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          const uint8* shuffler, int pix) {
5816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // src_argb
5818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]    // dst_argb
5819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // shuffler
58207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [ecx]  // xmm5 = 16-byte shuffle mask (unaligned load)
    // ecx is reused: the shuffler pointer is no longer needed once the mask
    // is in xmm5.
5821ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]   // pix
5822ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
5824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // first 16 source bytes
5825ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]  // next 16 source bytes
5826ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
5827ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm5  // permute bytes according to the mask
5828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm5
5829ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
5830ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1
5831ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]
58327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8  // 8 pixels (32 bytes) done
5833ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop  // repeat while remaining pix > 0
5834ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5835ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5836ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5837ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5838ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBSHUFFLEROW_AVX2
// AVX2 byte shuffle of src_argb into dst_argb.
//
// The 16-byte mask at shuffler is broadcast to both 128-bit halves of ymm5,
// and vpshufb applies it independently within each half.
//
// Each iteration handles 64 bytes and subtracts 16 from pix (4 bytes per
// pixel). The loop body runs before the count is tested, so at least 64 bytes
// are always read and written, and a pix that is not a multiple of 16 is
// processed as whole 64-byte blocks.
//
// The function is naked and uses only eax, ecx, edx and ymm registers, so
// nothing is pushed and the arguments sit at [esp + 4] onward.
58397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5840ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5841ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* shuffler, int pix) {
5842ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5843ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]     // src_argb
5844ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]     // dst_argb
5845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]    // shuffler
5846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
    // ecx is reused: the shuffler pointer is no longer needed once the mask
    // is in ymm5.
5847ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]    // pix
5848ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5849ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
5850ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm0, [eax]  // first 32 source bytes
5851ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    ymm1, [eax + 32]  // next 32 source bytes
5852ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 64]
5853ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm0, ymm0, ymm5  // permute bytes within each 128-bit half
5854ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpshufb    ymm1, ymm1, ymm5
5855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx], ymm0
5856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovdqu    [edx + 32], ymm1
5857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 64]
58587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16  // 16 pixels (64 bytes) done
5859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop  // repeat while remaining pix > 0
5860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Clear the upper halves of the ymm registers before returning;
    // conventionally done to avoid AVX to SSE transition penalties in callers.
5861ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
5862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBSHUFFLEROW_AVX2
5866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// SSE2 byte shuffle of src_argb into dst_argb, for CPUs without pshufb.
//
// Only the first four bytes of shuffler are examined, and the same 4-byte
// pattern is applied to every pixel. Those four bytes are read as one
// little-endian dword and compared against four known patterns; a match
// selects a loop that widens bytes to words and permutes them with
// pshufhw / pshuflw, 4 pixels (16 bytes) per iteration. Any other pattern
// falls into shuf_any1, which moves one pixel per iteration by indexed byte
// loads.
//
// Every loop runs its body before testing the count, so at least one
// iteration is always executed, and in the fast loops a pix that is not a
// multiple of 4 is processed as whole 16-byte blocks.
//
// The function is naked: ebx and esi are saved on entry and restored at
// shuf99.
58677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
5869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                         const uint8* shuffler, int pix) {
5870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       ebx
5872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The extra 8 in each offset below skips the two registers pushed above.
5873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_argb
5874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 8]    // dst_argb
5875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 12]   // shuffler
5876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 16]   // pix
5877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5  // xmm5 = 0, used to widen bytes to words
5878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Dispatch on the first four shuffler bytes read as a little-endian
    // dword, so the constant 0x03000102 corresponds to bytes 2, 1, 0, 3 in
    // memory order, and likewise for the others.
5879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ebx, [esi]   // shuffler
5880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        ebx, 0x03000102
5881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         shuf_3012
5882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        ebx, 0x00010203
5883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         shuf_0123
5884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        ebx, 0x00030201
5885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         shuf_0321
5886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        ebx, 0x02010003
5887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         shuf_2103
5888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Generic fallback, one pixel per iteration: for each of the four output
    // bytes, fetch the source index from the shuffler, load that source byte
    // and store it. ebx is used as scratch for both the index and the value.
5889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  // TODO(fbarchard): Use one source pointer and 3 offsets.
5890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  shuf_any1:
5891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [esi]  // index for output byte 0
5892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [eax + ebx]  // source byte at that index
5893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        [edx], bl
5894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [esi + 1]  // index for output byte 1
5895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [eax + ebx]
5896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        [edx + 1], bl
5897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [esi + 2]  // index for output byte 2
5898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [eax + ebx]
5899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        [edx + 2], bl
5900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [esi + 3]  // index for output byte 3
5901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, byte ptr [eax + ebx]
5902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        [edx + 3], bl
5903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
5904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 4]
5905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 1  // one pixel done
5906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         shuf_any1
5907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        shuf99
5908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Fast path for shuffler bytes 3, 2, 1, 0. The four fast loops share one
    // shape: load 4 pixels, widen the low and high 8 bytes to words, permute
    // the four words of each pixel with pshufhw (upper pixel of the register)
    // and pshuflw (lower pixel), then pack back to bytes and store.
5909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  shuf_0123:
5910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // 4 source pixels
5911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
5913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5  // pixels 0 and 1 as words
5914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm5  // pixels 2 and 3 as words
5915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
5916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm0, xmm0, 01Bh
5917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm1, xmm1, 01Bh
5918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm1, xmm1, 01Bh
5919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1  // words back to 16 bytes
5920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
5921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
59227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4  // 4 pixels done
5923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         shuf_0123
5924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        shuf99
5925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Fast path for shuffler bytes 1, 2, 3, 0.
5926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  shuf_0321:
5927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
5928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
5930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5
5931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm5
5932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
5933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm0, xmm0, 039h
5934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm1, xmm1, 039h
5935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm1, xmm1, 039h
5936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
5937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
5938ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
59397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
5940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         shuf_0321
5941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        shuf99
5942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Fast path for shuffler bytes 3, 0, 1, 2.
5943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  shuf_2103:
5944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
5945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
5947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5
5948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm5
5949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
5950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm0, xmm0, 093h
5951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm1, xmm1, 093h
5952ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm1, xmm1, 093h
5953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
5954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
5955ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
59567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
5957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         shuf_2103
5958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jmp        shuf99
5959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Fast path for shuffler bytes 2, 1, 0, 3. Falls through to shuf99 when
    // the loop ends.
5960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  shuf_3012:
5961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
5962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
5963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
5964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5
5965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm5
5966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
5967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm0, xmm0, 0C6h
5968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufhw    xmm1, xmm1, 0C6h
5969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshuflw    xmm1, xmm1, 0C6h
5970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
5971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edx], xmm0
5972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
59737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
5974ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         shuf_3012
5975ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Common exit: restore the saved registers in reverse push order.
5976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  shuf99:
5977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
5978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        ebx
5979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
5980ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
5981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
5982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// YUY2 - Macro-pixel = 2 image pixels
5984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
5986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// UYVY - Macro-pixel = 2 image pixels
5987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// U0Y0V0Y1
5988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs one row of I422 (separate Y, U and V pointers) into YUY2 byte order
// (Y0 U0 Y1 V0 ...). Naked 32-bit routine: arguments are read directly off the
// stack. Each pass of the loop reads 16 Y bytes, 8 U bytes and 8 V bytes and
// writes 32 bytes. The body runs once before width is tested and repeats
// while the count left after subtracting 16 is still greater than zero.
59897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
5990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToYUY2Row_SSE2(const uint8* src_y,
5991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
5992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
5993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_frame, int width) {
5994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
5995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
5996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The leading 8 in each offset skips the two registers pushed above.
5997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_y
5998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // src_u
5999ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // src_v
6000ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]   // dst_frame
6001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // width
    // edx becomes (src_v - src_u), so [esi + edx] reaches V while only the
    // U pointer has to be advanced.
6002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, esi
6003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
6005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [esi] // U
6006ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm3, qword ptr [esi + edx] // V
6007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 8]
6008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm3 // UV
6009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax] // Y
6010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
    // Keep a second copy of Y so both the low and the high 8 bytes can be
    // interleaved with the UV pairs.
6011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0
6012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm2 // YUYV
6013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm2
    // Unaligned stores: first 8 pixels, then the next 8.
6014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi], xmm0
6015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi + 16], xmm1
6016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 32]
6017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
6018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
6019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
6021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
6022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Packs one row of I422 (separate Y, U and V pointers) into UYVY byte order
// (U0 Y0 V0 Y1 ...). Naked 32-bit routine: arguments are read directly off the
// stack. Each pass of the loop reads 16 Y bytes, 8 U bytes and 8 V bytes and
// writes 32 bytes. The body runs once before width is tested and repeats
// while the count left after subtracting 16 is still greater than zero.
60267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid I422ToUYVYRow_SSE2(const uint8* src_y,
6028ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_u,
6029ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        const uint8* src_v,
6030ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_frame, int width) {
6031ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
6032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
6033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The leading 8 in each offset skips the two registers pushed above.
6034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_y
6035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // src_u
6036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // src_v
6037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 16]   // dst_frame
6038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // width
    // edx becomes (src_v - src_u), so [esi + edx] reaches V while only the
    // U pointer has to be advanced.
6039ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        edx, esi
6040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6041ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
6042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [esi] // U
6043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm3, qword ptr [esi + edx] // V
6044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [esi + 8]
6045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm3 // UV
6046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax] // Y
    // Here the UV pairs are the first operand of the interleave, so it is the
    // UV register that gets duplicated rather than Y.
6047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm2
6048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
6049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm0 // UYVY
6050ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm2, xmm0
    // Unaligned stores: first 8 pixels, then the next 8.
6051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi], xmm1
6052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi + 16], xmm2
6053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 32]
6054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16
6055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
6056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
6058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
6059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to every byte of an ARGB row:
//   result = C0 + C1 * X + C2 * X^2 + C3 * X^3
// poly is read as four consecutive groups of 4 floats at byte offsets 0, 16,
// 32 and 48 (C0..C3), one float per byte of a pixel. Two pixels (8 bytes) are
// processed per pass; the body runs once before width is tested.
// NOTE(review): poly is used as a memory operand of mulps/addps, which
// presumably requires it to be 16-byte aligned - confirm against callers.
60647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBPolynomialRow_SSE2(const uint8* src_argb,
6066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb, const float* poly,
6067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int width) {
6068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
6069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The leading 4 in each offset skips the register pushed above.
6070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   /* src_argb */
6071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 8]   /* dst_argb */
6072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 12]  /* poly */
6073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]  /* width */
6074ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
6075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6076ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 2 pixel loop.
6077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
6078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
6079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    // Bytes are widened in two steps (byte -> word -> dword) by interleaving
    // with the zero register.
6080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [eax]  // BGRABGRA
6081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 8]
6082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm3
6083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, xmm0
6084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm3  // pixel 0
6085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhwd  xmm4, xmm3  // pixel 1
6086ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm0, xmm0  // 4 floats
6087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvtdq2ps   xmm4, xmm4
    // xmm0/xmm1/xmm2 carry pixel 0 and xmm4/xmm5/xmm6 carry pixel 1; the two
    // pixels go through the same sequence of steps side by side.
6088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0  // X
6089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, xmm4
6090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm0, [esi + 16]  // C1 * X
6091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm4, [esi + 16]
6092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm0, [esi]  // result = C0 + C1 * X
6093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm4, [esi]
6094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm1
6095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, xmm5
6096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm2, xmm1  // X * X
6097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm6, xmm5
6098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm1, xmm2  // X * X * X
6099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm5, xmm6
6100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm2, [esi + 32]  // C2 * X * X
6101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm6, [esi + 32]
6102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm1, [esi + 48]  // C3 * X * X * X
6103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mulps      xmm5, [esi + 48]
6104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm0, xmm2  // result += C2 * X * X
6105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm4, xmm6
6106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm0, xmm1  // result += C3 * X * X * X
6107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    addps      xmm4, xmm5
    // Convert back to 32-bit integers with truncation.
6108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvttps2dq  xmm0, xmm0
6109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cvttps2dq  xmm4, xmm4
    // packuswb operates on 16-bit lanes; applying it twice narrows the eight
    // 32-bit results down to 8 bytes in the low half of xmm0.
    // NOTE(review): results outside the signed 16-bit range would not clamp
    // as a true dword-to-byte saturation would - confirm that is acceptable.
6110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm4
6111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
6112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0
6113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
61147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 2
6115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
6116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
6117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
6121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// 256-bit variant of the cubic polynomial row:
//   result = C0 + C1 * X + C2 * X^2 + C3 * X^3
// poly is read as four groups of 4 floats at byte offsets 0, 16, 32 and 48;
// each group is broadcast to both 128-bit halves of a ymm register so two
// pixels (8 bytes) are evaluated per pass. No registers are pushed, so the
// arguments sit at [esp + 4] onward. The body runs once before width is
// tested.
// NOTE(review): vfmadd132ps/vfmadd231ps are fused multiply-add instructions;
// presumably the dispatcher checks for FMA support as well as AVX2 - confirm.
61237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBPolynomialRow_AVX2(const uint8* src_argb,
6125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb, const float* poly,
6126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            int width) {
6127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
6128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]   /* src_argb */
6129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8]   /* dst_argb */
    // ecx holds poly only until the coefficients are loaded; it is reloaded
    // with width below.
6130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   /* poly */
6131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm4, [ecx]       // C0
6132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm5, [ecx + 16]  // C1
6133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm6, [ecx + 32]  // C2
6134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vbroadcastf128 ymm7, [ecx + 48]  // C3
6135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]  /* width */
6136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 2 pixel loop.
6138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian convertloop:
6139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
6140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea         eax, [eax + 8]
6141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vcvtdq2ps   ymm0, ymm0        // X 8 floats
6142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmulps      ymm2, ymm0, ymm0  // X * X
6143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmulps      ymm3, ymm0, ymm7  // C3 * X
    // ymm2 (X * X) and ymm3 (C3 * X) are computed before ymm0 is overwritten
    // with the running result.
6144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
6145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
6146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    // Convert back to 32-bit integers with truncation, then narrow
    // dword -> word -> byte; vpermq gathers the two useful 64-bit pieces into
    // the low 128 bits before the final pack.
6147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vcvttps2dq  ymm0, ymm0
6148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
6149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
6150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
6151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vmovq       qword ptr [edx], xmm0
6152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea         edx, [edx + 8]
61537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 2
6154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg          convertloop
    // Clear the upper ymm halves before returning to the caller.
6155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vzeroupper
6156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
6160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBCOLORTABLEROW_X86
6162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Transform ARGB pixels with color table.
// Works in place on dst_argb, one 4-byte pixel per pass. Byte c (0..3) of a
// pixel with value v is replaced by table_argb[v * 4 + c], i.e. the table is
// read as 4-byte entries holding one byte per channel. The body runs once
// before width is tested.
61637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
6165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           int width) {
6166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
6167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The leading 4 in each offset skips the register pushed above.
6168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   /* dst_argb */
6169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   /* table_argb */
6170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 12]  /* width */
6171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel loop.
6173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
6174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax]
    // eax is advanced right away, so the rest of this pixel is addressed as
    // [eax - 4 + c].
6175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
6176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4]
6177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4], dl
6178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax - 4 + 1]
6179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4 + 1]
6180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4 + 1], dl
6181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax - 4 + 2]
6182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4 + 2]
6183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4 + 2], dl
6184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax - 4 + 3]
6185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4 + 3]
6186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4 + 3], dl
6187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    dec        ecx
6188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
6189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
6190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBCOLORTABLEROW_X86
6194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_RGBCOLORTABLEROW_X86
6196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Transform RGB pixels with color table.
// Works in place on dst_argb, one 4-byte pixel per pass. Bytes 0..2 of a
// pixel are each replaced by table_argb[v * 4 + c], where v is the byte value
// and c its position; byte 3 of the pixel is neither read nor written. The
// body runs once before width is tested.
61977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
6199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
6200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
    // The leading 4 in each offset skips the register pushed above.
6201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]   /* dst_argb */
6202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]   /* table_argb */
6203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 12]  /* width */
6204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel loop.
6206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
6207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax]
    // eax is advanced right away, so the rest of this pixel is addressed as
    // [eax - 4 + c].
6208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 4]
6209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4]
6210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4], dl
6211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax - 4 + 1]
6212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4 + 1]
6213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4 + 1], dl
6214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax - 4 + 2]
6215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx * 4 + 2]
6216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [eax - 4 + 2], dl
6217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    dec        ecx
6218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
6219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
6221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_RGBCOLORTABLEROW_X86
6225ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Transform RGB pixels with luma table.
// For each 4-byte pixel, the four bytes are weighted by the four bytes of
// lumacoeff and summed; the sum with its low 8 bits cleared is added to the
// luma pointer to give a per-pixel table base. Bytes 0..2 of the pixel are
// then replaced by base[byte value] and byte 3 (alpha) is copied unchanged
// from src_argb to dst_argb. Four pixels (16 bytes) are handled per pass; the
// body runs once before width is tested.
62287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
6229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                 int width,
6231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                 const uint8* luma, uint32 lumacoeff) {
6232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
6233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
6234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
    // The leading 8 in each offset skips the two registers pushed above.
6235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]   /* src_argb */
6236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 8]   /* dst_argb */
6237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 12]  /* width */
    // The luma pointer and lumacoeff are each broadcast to all four 32-bit
    // lanes so they line up with the four pixels of a pass.
6238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
6239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
6240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm2, xmm2, 0
6241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm3, xmm3, 0
6242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
6243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psllw      xmm4, 8
    // xmm5 = 0, used to zero-extend the 16-bit sums to 32 bits.
6244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5
6245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 pixel loop.
6247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  convertloop:
    // NOTE(review): movdqu loads 16 bytes although the operand is annotated
    // "qword ptr"; "xmmword ptr" would describe the access - confirm and fix.
6248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, qword ptr [eax]      // generate luma ptr
    // Multiply-add adjacent byte pairs, then add adjacent words: one 16-bit
    // weighted sum per pixel ends up in the low four words.
6249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm3
6250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    phaddw     xmm0, xmm0
6251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm4  // mask out low bits
6252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm5
6253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm2  // add table base
    // xmm0 now holds four table pointers, one per pixel. esi takes the lowest
    // one and each pshufd rotates the next pointer into the low lane.
6254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       esi, xmm0
6255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Pixel 0.
6257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax]
6258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi], dl
6260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 1]
6261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 1], dl
6263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 2]
6264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 2], dl
6266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 3]  // copy alpha.
6267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 3], dl
6268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Pixel 1.
6269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       esi, xmm0
6270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 4]
6273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 4], dl
6275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 5]
6276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 5], dl
6278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 6]
6279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 6], dl
6281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 7]  // copy alpha.
6282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 7], dl
6283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Pixel 2.
6284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       esi, xmm0
6285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
6286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 8]
6288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 8], dl
6290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 9]
6291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 9], dl
6293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 10]
6294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 10], dl
6296ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 11]  // copy alpha.
6297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 11], dl
6298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
    // Pixel 3 (no further rotate needed after the last pointer is taken).
6299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       esi, xmm0
6300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 12]
6302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 12], dl
6304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 13]
6305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 13], dl
6307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 14]
6308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [esi + edx]
6309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 14], dl
6310ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      edx, byte ptr [eax + 15]  // copy alpha.
6311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        byte ptr [edi + 15], dl
6312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]
6314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 16]
63157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
6316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         convertloop
6317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
6319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
6320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
6321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
6322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
6323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif  // defined(_M_X64)
63267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
6328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
6329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // extern "C"
6330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // namespace libyuv
6331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
6332