15327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org/*
2b0c97975894a5eebebf9d93147cdd941a3accb63fbarchard@google.com *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
35327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org *
45327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org *  Use of this source code is governed by a BSD-style license
55327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org *  that can be found in the LICENSE file in the root of the source
65327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org *  tree. An additional intellectual property rights grant can be found
7cde587092fef0dbed2c35602f30b79e7b892e766fbarchard@google.com *  in the file PATENTS. All contributing project authors may
85327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org *  be found in the AUTHORS file in the root of the source tree.
95327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org */
105327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org
11142f6c4ed5eaeec0176f255e64bac8d8c70b42e1fbarchard@google.com#include "libyuv/row.h"
125327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org
13e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#if defined (_M_X64)
14e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#include <emmintrin.h>
15e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#include <tmmintrin.h>  // For _mm_maddubs_epi16
16e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#endif
17e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
18fe5ff7ed5451496281697bda9cb85084c532926cfbarchard@google.com#ifdef __cplusplus
19fe5ff7ed5451496281697bda9cb85084c532926cfbarchard@google.comnamespace libyuv {
205327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.orgextern "C" {
21fe5ff7ed5451496281697bda9cb85084c532926cfbarchard@google.com#endif
225327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org
23e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com// This module is for Visual C.
24e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
25e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
// Fixed-point YUV to RGB coefficients, scaled by 64 (6 fractional bits) and
// kept within the range of a signed 8-bit lane.
// Negative values and compound expressions are parenthesized so each macro
// expands to a single operand wherever it is used.
#define YG 74  /* (int8)(1.164 * 64 + 0.5) */

#define UB 127  /* min(127,(int8)(2.018 * 64)) */
#define UG (-25)  /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52)  /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102  /* (int8)(1.596 * 64 + 0.5) */

// Bias: the value of the U/V weighted sum when U = V = 128.  The row
// functions subtract it so that chroma is effectively centred on zero.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)
40e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
41e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec8 kUVToB = {
42e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
43e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com};
44e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
45e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec8 kUVToR = {
46e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
47e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com};
48e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
49e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec8 kUVToG = {
50e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
51e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com};
52e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
53e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec8 kVUToB = {
54e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
55e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com};
56e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
57e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec8 kVUToR = {
58e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
59e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com};
60e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
61e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec8 kVUToG = {
62e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
63e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com};
64e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
65e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
66e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
67e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
68e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
69e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comstatic const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
70e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
71e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com// 64 bit
72e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#if defined(_M_X64)
73e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
74e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com// Aligned destination version.
75e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com__declspec(align(16))
76e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comvoid I422ToARGBRow_SSSE3(const uint8* y_buf,
77e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                         const uint8* u_buf,
78e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                         const uint8* v_buf,
79e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                         uint8* dst_argb,
80e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                         int width) {
81e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
82e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  __m128i xmm0, xmm1, xmm2, xmm3;
83e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  const __m128i xmm5 = _mm_set1_epi8(-1);
84e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  const __m128i xmm4 = _mm_setzero_si128();
85e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
86e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
87e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  while (width > 0) {
88e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
89e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
90e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
91e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
92e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_load_si128(&xmm0);
93e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_load_si128(&xmm0);
94e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
95e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
96e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
97e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
98e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
99e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
100e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
101e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
102e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
103e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
104e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_adds_epi16(xmm0, xmm3);
105e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_adds_epi16(xmm1, xmm3);
106e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_adds_epi16(xmm2, xmm3);
107e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_srai_epi16(xmm0, 6);
108e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_srai_epi16(xmm1, 6);
109e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_srai_epi16(xmm2, 6);
110e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_packus_epi16(xmm0, xmm0);
111e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_packus_epi16(xmm1, xmm1);
112e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_packus_epi16(xmm2, xmm2);
113e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
114e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
115e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_load_si128(&xmm0);
116e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
117e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
118e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
119e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    _mm_store_si128((__m128i *)dst_argb, xmm0);
120e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
121e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
122e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    y_buf += 8;
123e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    u_buf += 4;
124e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    dst_argb += 32;
125e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    width -= 8;
126e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  }
127e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com}
128e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
129e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com// Unaligned destination version.
130e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.comvoid I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
131e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                                   const uint8* u_buf,
132e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                                   const uint8* v_buf,
133e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                                   uint8* dst_argb,
134e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                                   int width) {
135e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
136e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  __m128i xmm0, xmm1, xmm2, xmm3;
137e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  const __m128i xmm5 = _mm_set1_epi8(-1);
138e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  const __m128i xmm4 = _mm_setzero_si128();
139e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
140e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
141e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  while (width > 0) {
142e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
143e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
144e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
145e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
146e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_load_si128(&xmm0);
147e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_load_si128(&xmm0);
148e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
149e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
150e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
151e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
152e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
153e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
154e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
155e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
156e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
157e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
158e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_adds_epi16(xmm0, xmm3);
159e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_adds_epi16(xmm1, xmm3);
160e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_adds_epi16(xmm2, xmm3);
161e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_srai_epi16(xmm0, 6);
162e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_srai_epi16(xmm1, 6);
163e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_srai_epi16(xmm2, 6);
164e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_packus_epi16(xmm0, xmm0);
165e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_packus_epi16(xmm1, xmm1);
166e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_packus_epi16(xmm2, xmm2);
167e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
168e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
169e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_load_si128(&xmm0);
170e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
171e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
172e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
173e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    _mm_storeu_si128((__m128i *)dst_argb, xmm0);
174e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
175e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com
176e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    y_buf += 8;
177e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    u_buf += 4;
178e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    dst_argb += 32;
179e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com    width -= 8;
180e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com  }
181e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com}
182e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com// 32 bit
183e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#else  // defined(_M_X64)
1842d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com
185585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com#ifdef HAS_ARGBTOYROW_SSSE3
186585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
187c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Constants for ARGB.
188851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToY = {
189585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
190585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com};
191585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
1924e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com// JPeg full range.
193851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToYJ = {
194050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
1954e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com};
1964e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com
197851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToU = {
198585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
199585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com};
200585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
201851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToUJ = {
202050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
203050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com};
204050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
205851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToV = {
206585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
207585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com};
208585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
209851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToVJ = {
210050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
211050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com};
212050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
213caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com// vpermd for vphaddw + vpackuswb vpermd.
214446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.comstatic const lvec32 kPermdARGBToY_AVX = {
215551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com  0, 4, 1, 5, 2, 6, 3, 7
216551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com};
217551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
218caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com// vpshufb for vphaddw + vpackuswb packed to shorts.
219851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec8 kShufARGBToUV_AVX = {
220caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
221caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
222caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com};
223caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com
224c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Constants for BGRA.
225851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kBGRAToY = {
2269394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
2279394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2289394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
229851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kBGRAToU = {
2309394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
2319394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2329394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
233851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kBGRAToV = {
2349394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
2359394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2369394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
237c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Constants for ABGR.
238851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kABGRToY = {
2399394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
2409394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2419394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
242851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kABGRToU = {
2439394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
2449394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2459394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
246851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kABGRToV = {
2479394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
2489394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2499394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
25025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com// Constants for RGBA.
251851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kRGBAToY = {
25225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
25325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com};
25425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
255851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kRGBAToU = {
25625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
25725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com};
25825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
259851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kRGBAToV = {
26025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
26125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com};
26225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
263851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kAddY16 = {
264228bdc24e44264baf3402124aaa6d4d81c8896f5fbarchard@google.com  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
265585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com};
266585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
267851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec16 kAddYJ64 = {
2684e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com  64, 64, 64, 64, 64, 64, 64, 64
2694e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com};
270551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
271851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kAddUV128 = {
2729394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
2739394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
274585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com};
275585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
276851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec16 kAddUVJ128 = {
277050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
278050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com};
279050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
280ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.com// Shuffle table for converting RGB24 to ARGB.
281851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMaskRGB24ToARGB = {
2829394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
2839394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2849394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
2859394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com// Shuffle table for converting RAW to ARGB.
286851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMaskRAWToARGB = {
2879394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
2889394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com};
2899394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
2909eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com// Shuffle table for converting ARGB to RGB24.
291851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMaskARGBToRGB24 = {
292f1b6063f50ced6f1b5f9b735011b382a5c1c963ffbarchard@google.com  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
293f1b6063f50ced6f1b5f9b735011b382a5c1c963ffbarchard@google.com};
2949eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
2959eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com// Shuffle table for converting ARGB to RAW.
296851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMaskARGBToRAW = {
297f3fb7b692068862b1091c02b41ac48bfa9258d51fbarchard@google.com  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
298f1b6063f50ced6f1b5f9b735011b382a5c1c963ffbarchard@google.com};
2999eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
3004de0c439aae9f2d40246dfebce82c18a159ebdc8fbarchard@google.com// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
301851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMaskARGBToRGB24_0 = {
302827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
303827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com};
304827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
305827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com// Shuffle table for converting ARGB to RAW.
306851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMaskARGBToRAW_0 = {
307827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
308827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com};
309827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
31000b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com// Duplicates gray value 3 times and fills in alpha opaque.
// Each pass expands 8 Y bytes into 8 four byte pixels (32 bytes) and writes
// them with aligned 16 byte stores (movdqa), so dst_argb must be 16 byte
// aligned.
// The function is naked: there is no prologue, so the three arguments are
// read straight from the stack relative to esp.
// NOTE: the count is only tested after a pass, and every pass handles 8
// pixels, so 8 source bytes are read and 32 destination bytes are written
// per pass even when pix is not a multiple of 8.
311d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
312b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.comvoid I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
313b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com  __asm {
314b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov        eax, [esp + 4]        // src_y
315b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov        edx, [esp + 8]        // dst_argb
316b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov        ecx, [esp + 12]       // pix
317b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
318b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pslld      xmm5, 24
319b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com
320c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
321f7a5048f548a92825fc32fb107c092b10627a03dfbarchard@google.com  convertloop:
322b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movq       xmm0, qword ptr [eax]  // load 8 Y bytes
323b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    lea        eax,  [eax + 8]  // advance src_y by 8 bytes
324b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    punpcklbw  xmm0, xmm0  // double each byte: Y0 Y0 Y1 Y1 ... Y7 Y7
325b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa     xmm1, xmm0  // keep a copy for the upper 4 pixels
326b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    punpcklwd  xmm0, xmm0  // pixels 0..3: each Y repeated 4 times
327b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    punpckhwd  xmm1, xmm1  // pixels 4..7: each Y repeated 4 times
328b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por        xmm0, xmm5  // force the top byte of each pixel to 0xff
329b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por        xmm1, xmm5
330b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa     [edx], xmm0  // aligned store of pixels 0..3
331b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa     [edx + 16], xmm1  // aligned store of pixels 4..7
332b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    lea        edx, [edx + 32]  // advance dst_argb by 32 bytes
333b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    sub        ecx, 8  // 8 pixels done
33418184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while the remaining count is > 0
335b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    ret
336b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com  }
337b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com}
338b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com
// Expands a row of 8-bit Y samples into 32-bit pixels: every source byte is
// copied into the three low bytes of a dword and the top byte is set to 0xff.
// Each loop iteration reads 8 source bytes and writes 32 destination bytes
// using movdqu, so dst_argb does not have to be 16-byte aligned.
// The counter is decremented by 8 and tested after the body, so the loop runs
// at least once and a pix that is not a multiple of 8 is rounded up.
// Naked function: arguments are read straight from the stack at esp + 4/8/12.
339d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
34000b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.comvoid I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
34100b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com                                  int pix) {
34200b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com  __asm {
34300b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    mov        eax, [esp + 4]        // src_y
34400b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    mov        edx, [esp + 8]        // dst_argb
34500b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    mov        ecx, [esp + 12]       // pix
34600b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
34700b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    pslld      xmm5, 24
34800b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com
349c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
35000b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com  convertloop:
    // Load 8 Y bytes, then widen: bytes -> pairs -> quads (y y y y per dword).
35100b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    movq       xmm0, qword ptr [eax]
35200b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    lea        eax,  [eax + 8]
35300b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    punpcklbw  xmm0, xmm0
35400b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    movdqa     xmm1, xmm0
35500b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    punpcklwd  xmm0, xmm0
35600b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    punpckhwd  xmm1, xmm1
    // Force the top byte of every dword to 0xff.
35700b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    por        xmm0, xmm5
35800b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    por        xmm1, xmm5
    // Unaligned stores of the first and second group of 4 pixels.
35900b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    movdqu     [edx], xmm0
36000b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    movdqu     [edx + 16], xmm1
36100b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    lea        edx, [edx + 32]
36200b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    sub        ecx, 8
36300b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    jg         convertloop
36400b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com    ret
36500b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com  }
36600b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com}
36700b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com
// Converts a row of packed 3-byte pixels to 4-byte ARGB, 16 pixels per loop
// iteration (48 source bytes in, 64 destination bytes out).
// Source is read with movdqu; the destination is written with movdqa, so
// dst_argb must be 16-byte aligned. The counter is decremented by 16 and
// tested after the body, so the loop runs at least once.
// The byte reordering is done by pshufb with kShuffleMaskRGB24ToARGB, which is
// defined outside this function; the top byte of every output dword is then
// forced to 0xff.
36800b69a2fe66183be5f72cb80c59f22e137b45359fbarchard@google.com__declspec(naked) __declspec(align(16))
369ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.comvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
370f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
371ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.com    mov       eax, [esp + 4]   // src_rgb24
372b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov       edx, [esp + 8]   // dst_argb
373b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov       ecx, [esp + 12]  // pix
374b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
375b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pslld     xmm5, 24
376ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.com    movdqa    xmm4, kShuffleMaskRGB24ToARGB
377b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com
378c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
379eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
    // Source bytes 0..47: pixels 0-3 sit in xmm0[0:11], pixels 4-7 straddle
    // xmm0/xmm1, pixels 8-11 straddle xmm1/xmm3, pixels 12-15 sit in xmm3[4:15].
    // palignr moves each 12-byte group to the low end of a register before
    // the shuffle. Stores are interleaved with the arithmetic.
380b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm0, [eax]
381b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm1, [eax + 16]
382b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm3, [eax + 32]
383b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    lea       eax, [eax + 48]
384b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    xmm2, xmm3
385b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
386b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm2, xmm4
387b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm2, xmm5
388b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
389b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm0, xmm4
390b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx + 32], xmm2
391b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm0, xmm5
392b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm1, xmm4
393b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx], xmm0
394b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm1, xmm5
395b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
396b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm3, xmm4
397b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx + 16], xmm1
398b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm3, xmm5
39918184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub       ecx, 16
    // The movdqa and lea below do not modify flags, so jg still tests the
    // result of the sub above.
400b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx + 48], xmm3
401b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    lea       edx, [edx + 64]
40218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
403b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    ret
404b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com  }
405b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com}
406b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com
// Converts a row of packed 3-byte RAW pixels to 4-byte ARGB, 16 pixels per
// loop iteration (48 source bytes in, 64 destination bytes out).
// The instruction sequence matches RGB24ToARGBRow_SSSE3; only the shuffle mask
// (kShuffleMaskRAWToARGB, defined outside this function) differs.
// NOTE(review): presumably RAW is the reverse channel order of RGB24 - confirm
// against the mask definition.
// Source is read with movdqu; the destination is written with movdqa, so
// dst_argb must be 16-byte aligned. The loop runs at least once.
407d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
408b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.comvoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
409b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com                        int pix) {
410f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
411b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov       eax, [esp + 4]   // src_raw
412b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov       edx, [esp + 8]   // dst_argb
413b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    mov       ecx, [esp + 12]  // pix
414b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
415b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pslld     xmm5, 24
4166334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa    xmm4, kShuffleMaskRAWToARGB
417b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com
418c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
419eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
    // Load 48 bytes, use palignr to bring each 12-byte (4 pixel) group to the
    // low end of a register, shuffle to 4 bytes per pixel, then OR in 0xff
    // for the top byte. Stores are interleaved with the arithmetic.
420b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm0, [eax]
421b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm1, [eax + 16]
422b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm3, [eax + 32]
423b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    lea       eax, [eax + 48]
424b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    xmm2, xmm3
425b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
426b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm2, xmm4
427b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm2, xmm5
428b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
429b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm0, xmm4
430b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx + 32], xmm2
431b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm0, xmm5
432b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm1, xmm4
433b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx], xmm0
434b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm1, xmm5
435b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
436b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pshufb    xmm3, xmm4
437b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx + 16], xmm1
438b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    por       xmm3, xmm5
43918184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub       ecx, 16
    // The movdqa and lea below do not modify flags, so jg still tests the
    // result of the sub above.
440b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    movdqa    [edx + 48], xmm3
441b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    lea       edx, [edx + 64]
44218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
443b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    ret
444b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com  }
445b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com}
446b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com
447c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// pmul method to replicate bits.
448c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Math to replicate bits (5-bit field v held in the upper 5 bits of a word):
449ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com// (v << 8) | (v << 3)
450ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com// v * 256 + v * 8
451ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com// v * (256 + 8)
452ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com// 6-bit G uses (256 + 4); its shift of 5 is incorporated, so shift is 5 + 8 and 5 + 2 (0x2080)
453c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 20 instructions.
// Converts a row of 16-bit 565 pixels to 32-bit ARGB, 8 pixels per loop
// iteration (16 source bytes in, 32 destination bytes out). Alpha is set to
// 0xff. Source is read with movdqu; the destination is written with movdqa,
// so dst_argb must be 16-byte aligned. The loop runs at least once.
454d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
455ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.comvoid RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
456ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com                          int pix) {
457f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
458ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
459ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movd      xmm5, eax
460ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pshufd    xmm5, xmm5, 0
4616d6b7709f754391252dc716b92801cc8ec425642fbarchard@google.com    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
462ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movd      xmm6, eax
463ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pshufd    xmm6, xmm6, 0
464ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
465ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm3, 11
466ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
467ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm4, 10
468ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psrlw     xmm4, 5
469ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
470ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm7, 8
471ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com
472ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       eax, [esp + 4]   // src_rgb565
473ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       edx, [esp + 8]   // dst_argb
474ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       ecx, [esp + 12]  // pix
    // edx = dst_argb - 2 * src, so [eax * 2 + edx] addresses the destination
    // and only eax has to be advanced inside the loop.
475ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       edx, eax
476ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       edx, eax
477ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com
478c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
479ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com convertloop:
480b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
481ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm1, xmm0
482ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm2, xmm0
483ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pand      xmm1, xmm3    // R in upper 5 bits
484ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm2, 11      // B in upper 5 bits
    // pmulhuw keeps the high 16 bits of the product, leaving the expanded
    // 8-bit value in the low byte of each word.
485ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pmulhuw   xmm1, xmm5    // * (256 + 8)
486ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pmulhuw   xmm2, xmm5    // * (256 + 8)
487ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm1, 8
488ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    por       xmm1, xmm2    // RB
489ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pand      xmm0, xmm4    // G in middle 6 bits
490ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
491ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    por       xmm0, xmm7    // AG
    // Interleave the RB and AG bytes to form B, G, R, A per pixel.
492ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm2, xmm1
493ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    punpcklbw xmm1, xmm0
494ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    punpckhbw xmm2, xmm0
495ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
496ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
497ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    lea       eax, [eax + 16]
498ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       ecx, 8
49918184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
500ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    ret
501ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com  }
502ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com}
503ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com
504ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com// 24 instructions
// Converts a row of 16-bit 1555 pixels to 32-bit ARGB, 8 pixels per loop
// iteration (16 source bytes in, 32 destination bytes out). The three 5-bit
// colour fields are widened to 8 bits by multiplication, and the single top
// bit of each word is spread into a full 0x00 / 0xff alpha byte.
// Source is read with movdqu; the destination is written with movdqa, so
// dst_argb must be 16-byte aligned. The loop runs at least once.
505d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
506ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.comvoid ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
507ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com                            int pix) {
508f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
509ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
510ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movd      xmm5, eax
511ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pshufd    xmm5, xmm5, 0
512ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
513ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movd      xmm6, eax
514ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pshufd    xmm6, xmm6, 0
515ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
516ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm3, 11
5170e6ce93c84f710e6a589c6c6edfe480ad0567f0cfbarchard@google.com    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
518ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psrlw     xmm4, 6
519ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
520ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm7, 8
521ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com
522ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       eax, [esp + 4]   // src_argb1555
523ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       edx, [esp + 8]   // dst_argb
524ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    mov       ecx, [esp + 12]  // pix
    // edx = dst_argb - 2 * src, so [eax * 2 + edx] addresses the destination
    // and only eax has to be advanced inside the loop.
525ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       edx, eax
526ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       edx, eax
527ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com
528c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
529ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com convertloop:
530b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
531ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm1, xmm0
532ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm2, xmm0
533ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm1, 1       // R in upper 5 bits
534ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm2, 11      // B in upper 5 bits
535ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pand      xmm1, xmm3
    // pmulhuw keeps the high 16 bits of the product, leaving the expanded
    // 8-bit value in the low byte of each word.
536ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pmulhuw   xmm2, xmm5    // * (256 + 8)
537ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pmulhuw   xmm1, xmm5    // * (256 + 8)
538ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psllw     xmm1, 8
539ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    por       xmm1, xmm2    // RB
540ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm2, xmm0
541ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pand      xmm0, xmm4    // G in middle 5 bits
    // The arithmetic shift copies the top (alpha) bit across the high byte;
    // the pand with 0xff00 below then keeps only that byte.
542ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    psraw     xmm2, 8       // A
543ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
544ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    pand      xmm2, xmm7
545ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    por       xmm0, xmm2    // AG
    // Interleave the RB and AG bytes to form B, G, R, A per pixel.
546ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    xmm2, xmm1
547ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    punpcklbw xmm1, xmm0
548ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    punpckhbw xmm2, xmm0
549ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
550ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
551ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    lea       eax, [eax + 16]
552ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       ecx, 8
55318184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
554ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    ret
555ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com  }
556ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com}
5576aa761da6d5de07a602425bbe070f5dc067c3d68fbarchard@google.com
558c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 18 instructions.
// Converts a row of 16-bit 4444 pixels to 32-bit ARGB, 8 pixels per loop
// iteration (16 source bytes in, 32 destination bytes out). Every 4-bit
// nibble n becomes the byte (n << 4) | n.
// Source is read with movdqu; the destination is written with movdqa, so
// dst_argb must be 16-byte aligned. The loop runs at least once.
559d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
56017272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.comvoid ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
56117272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com                            int pix) {
562f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
56317272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
56417272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    movd      xmm4, eax
56517272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    pshufd    xmm4, xmm4, 0
56617272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
56717272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    pslld     xmm5, 4
56817272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    mov       eax, [esp + 4]   // src_argb4444
56917272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    mov       edx, [esp + 8]   // dst_argb
57017272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    mov       ecx, [esp + 12]  // pix
    // edx = dst_argb - 2 * src, so [eax * 2 + edx] addresses the destination
    // and only eax has to be advanced inside the loop.
571ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       edx, eax
572ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    sub       edx, eax
57317272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com
574c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
57517272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com convertloop:
576b1dd02d66cbda3e0c571bf81c247f850cdb3e2fdfbarchard@google.com    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
57717272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    movdqa    xmm2, xmm0
57817272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    pand      xmm0, xmm4    // mask low nibbles
57917272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    pand      xmm2, xmm5    // mask high nibbles
    // Duplicate each nibble into the other half of its byte. Because the
    // opposite nibble was masked to zero, the 16-bit shifts cannot carry
    // bits into a neighbouring byte.
58017272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    movdqa    xmm1, xmm0
58117272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    movdqa    xmm3, xmm2
58217272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    psllw     xmm1, 4
58317272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    psrlw     xmm3, 4
58417272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    por       xmm0, xmm1
58517272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    por       xmm2, xmm3
    // Interleave: each source byte yields two output bytes, the expanded low
    // nibble followed by the expanded high nibble.
586ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.com    movdqa    xmm1, xmm0
58717272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    punpcklbw xmm0, xmm2
588ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.com    punpckhbw xmm1, xmm2
589ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
590ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
591ccd6d9b2de6af7985775a2e5537190cf5794dd44fbarchard@google.com    lea       eax, [eax + 16]
592ba1f52692605bbf8fedb8a915275c71fa186d291fbarchard@google.com    sub       ecx, 8
59318184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
59417272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com    ret
59517272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com  }
59617272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com}
59717272be539c30cebca4cd11e2945ae94cd876a20fbarchard@google.com
// Packs a row of 4-byte ARGB pixels down to 3 bytes per pixel, 16 pixels per
// loop iteration (64 source bytes in, 48 destination bytes out).
// Both loads and stores use movdqu, so neither pointer has to be aligned.
// The counter is decremented by 16 and tested after the body, so the loop
// runs at least once.
// NOTE(review): the por merges below only work if kShuffleMaskARGBToRGB24
// (defined outside this function) zeroes bytes 12..15 of each shuffled
// register - confirm against the mask definition.
598d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
5999eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.comvoid ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
600f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
6019eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       eax, [esp + 4]   // src_argb
6029eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       edx, [esp + 8]   // dst_rgb
6039eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       ecx, [esp + 12]  // pix
604510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm6, kShuffleMaskARGBToRGB24
6059eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
606c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6079eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com convertloop:
6087e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
6097e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm1, [eax + 16]
6107e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm2, [eax + 32]
6117e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm3, [eax + 48]
6129eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       eax, [eax + 64]
613510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
614510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm1, xmm6
615510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm2, xmm6
616510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm3, xmm6
    // Stitch four 12-byte results into three full 16-byte registers:
    //   out0 = reg0[0:11] + reg1[0:3]
    //   out1 = reg1[4:11] + reg2[0:7]
    //   out2 = reg2[8:11] + reg3[0:11]
617510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
618510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrldq    xmm1, 4      // 8 bytes from 1
619510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslldq    xmm4, 12     // 4 bytes from 1 for 0
620510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
621510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm0, xmm4   // 4 bytes from 1 for 0
622510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslldq    xmm5, 8      // 8 bytes from 2 for 1
6237e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    [edx], xmm0  // store 0
624510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm1, xmm5   // 8 bytes from 2 for 1
625510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrldq    xmm2, 8      // 4 bytes from 2
626510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslldq    xmm3, 4      // 12 bytes from 3 for 2
627510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm2, xmm3   // 12 bytes from 3 for 2
6287e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    [edx + 16], xmm1   // store 1
6297e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    [edx + 32], xmm2   // store 2
6309eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       edx, [edx + 48]
6319eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    sub       ecx, 16
63218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
6339eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    ret
6349eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com  }
6359eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com}
6369eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
// Packs a row of 4-byte ARGB pixels down to 3 bytes per pixel, 16 pixels per
// loop iteration (64 source bytes in, 48 destination bytes out).
// The instruction sequence matches ARGBToRGB24Row_SSSE3; only the shuffle mask
// (kShuffleMaskARGBToRAW, defined outside this function) differs.
// Both loads and stores use movdqu, so neither pointer has to be aligned.
// The loop runs at least once.
// NOTE(review): the por merges below only work if the mask zeroes bytes
// 12..15 of each shuffled register - confirm against the mask definition.
637d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
6389eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.comvoid ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
639f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
6409eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       eax, [esp + 4]   // src_argb
6419eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       edx, [esp + 8]   // dst_rgb
6429eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       ecx, [esp + 12]  // pix
643510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm6, kShuffleMaskARGBToRAW
6449eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
645c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6469eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com convertloop:
6477e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
6487e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm1, [eax + 16]
6497e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm2, [eax + 32]
6507e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    xmm3, [eax + 48]
6519eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       eax, [eax + 64]
652510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
653510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm1, xmm6
654510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm2, xmm6
655510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pshufb    xmm3, xmm6
    // Stitch four 12-byte results into three full 16-byte registers:
    //   out0 = reg0[0:11] + reg1[0:3]
    //   out1 = reg1[4:11] + reg2[0:7]
    //   out2 = reg2[8:11] + reg3[0:11]
656510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
657510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrldq    xmm1, 4      // 8 bytes from 1
658510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslldq    xmm4, 12     // 4 bytes from 1 for 0
659510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
660510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm0, xmm4   // 4 bytes from 1 for 0
661510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslldq    xmm5, 8      // 8 bytes from 2 for 1
6627e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    [edx], xmm0  // store 0
663510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm1, xmm5   // 8 bytes from 2 for 1
664510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrldq    xmm2, 8      // 4 bytes from 2
665510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslldq    xmm3, 4      // 12 bytes from 3 for 2
666510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm2, xmm3   // 12 bytes from 3 for 2
6677e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    [edx + 16], xmm1   // store 1
6687e7c7753ba712c0d1365276c27ee13866b948415fbarchard@google.com    movdqu    [edx + 32], xmm2   // store 2
6699eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       edx, [edx + 48]
6709eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    sub       ecx, 16
67118184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
6729eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    ret
6739eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com  }
6749eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com}
6759eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
// Converts ARGB pixels to RGB565, 4 pixels (16 source bytes -> 8 destination
// bytes) per loop iteration.
//   src_argb: source pixels; loaded with movdqa, so it must be 16-byte aligned.
//   dst_rgb:  destination; written 8 bytes at a time with movq.
//   pix:      pixel count. The loop subtracts 4 per pass and repeats while the
//             remainder is > 0, so pixels are handled in whole groups of 4 and
//             the body always executes at least once.
// The function is naked (no prologue), so the arguments are read directly
// from the stack relative to esp.
676d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
6779eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.comvoid ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
678f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
6799eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       eax, [esp + 4]   // src_argb
6809eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       edx, [esp + 8]   // dst_rgb
6819eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       ecx, [esp + 12]  // pix
682510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f (5 bit B field)
683510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm3, 27
684510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0 (6 bit G field)
685510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm4, 26
686510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslld     xmm4, 5
687510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800 (R field + sign bits)
688510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslld     xmm5, 11
6899eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
690c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6919eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com convertloop:
6929eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm0, [eax]   // fetch 4 pixels of argb (aligned load)
6939eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm1, xmm0    // B
6949eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm2, xmm0    // G
695510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslld     xmm0, 8       // R: shift the alpha byte out of each dword
696510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm1, 3       // B: top 5 bits of B land in bits 0..4
697510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm2, 5       // G: top 6 bits of G land in bits 5..10
698510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrad     xmm0, 16      // R: arithmetic shift, R now in bits 8..15, sign-extended
699510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm1, xmm3    // B
700510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm2, xmm4    // G
701510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm0, xmm5    // R: keep top 5 bits of R plus the sign extension
702510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm1, xmm2    // BG
703510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm0, xmm1    // BGR
7049eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    packssdw  xmm0, xmm0    // each dword is a sign-extended 16 bit value, so the signed pack does not saturate
70524d2656b65beb2a86acf1913a5c025a6aca21299fbarchard@google.com    lea       eax, [eax + 16]
70615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
7079eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       edx, [edx + 8]
7089eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    sub       ecx, 4
70918184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
7109eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    ret
7119eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com  }
7129eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com}
7139eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
// Converts ARGB pixels to ARGB1555, 4 pixels (16 source bytes -> 8 destination
// bytes) per loop iteration.
//   src_argb: source pixels; loaded with movdqa, so it must be 16-byte aligned.
//   dst_rgb:  destination; written 8 bytes at a time with movq.
//   pix:      pixel count. The loop subtracts 4 per pass and repeats while the
//             remainder is > 0, so pixels are handled in whole groups of 4 and
//             the body always executes at least once.
// Naked function: arguments are read directly from the stack relative to esp.
714c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// TODO(fbarchard): Improve sign extension/packing.
715d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
7169eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.comvoid ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
717f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
7189eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       eax, [esp + 4]   // src_argb
7199eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       edx, [esp + 8]   // dst_rgb
7209eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    mov       ecx, [esp + 12]  // pix
721510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f (5 bit B field)
722510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm4, 27
723510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm5, xmm4       // generate mask 0x000003e0 (5 bit G field)
724510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslld     xmm5, 5
725510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm6, xmm4       // generate mask 0x00007c00 (5 bit R field)
726510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslld     xmm6, 10
727510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000 (A bit + sign bits)
728510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pslld     xmm7, 15
7299eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
730c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
7319eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com convertloop:
7329eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm0, [eax]   // fetch 4 pixels of argb (aligned load)
7339eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm1, xmm0    // B
7349eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm2, xmm0    // G
735510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    movdqa    xmm3, xmm0    // R
736510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrad     xmm0, 16      // A: arithmetic shift, top bit of A becomes bit 15 and the sign
737510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm1, 3       // B: top 5 bits of B land in bits 0..4
738510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm2, 6       // G: top 5 bits of G land in bits 5..9
739510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    psrld     xmm3, 9       // R: top 5 bits of R land in bits 10..14
740510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm0, xmm7    // A
741510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm1, xmm4    // B
742510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm2, xmm5    // G
743510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    pand      xmm3, xmm6    // R
744510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm0, xmm1    // BA
745510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm2, xmm3    // GR
746510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    por       xmm0, xmm2    // BGRA
74724d2656b65beb2a86acf1913a5c025a6aca21299fbarchard@google.com    packssdw  xmm0, xmm0    // each dword is a sign-extended 16 bit value, so the signed pack does not saturate
74824d2656b65beb2a86acf1913a5c025a6aca21299fbarchard@google.com    lea       eax, [eax + 16]
74924d2656b65beb2a86acf1913a5c025a6aca21299fbarchard@google.com    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
7509eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       edx, [edx + 8]
7519eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    sub       ecx, 4
75218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
7539eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    ret
7549eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com  }
7559eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com}
7569eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
// Converts ARGB pixels to ARGB4444, 4 pixels (16 source bytes -> 8 destination
// bytes) per loop iteration, keeping the high nibble of every channel.
//   src_argb: source pixels; loaded with movdqa, so it must be 16-byte aligned.
//   dst_rgb:  destination; written 8 bytes at a time with movq.
//   pix:      pixel count. The loop subtracts 4 per pass and repeats while the
//             remainder is > 0, so pixels are handled in whole groups of 4 and
//             the body always executes at least once.
// Naked function: arguments are read directly from the stack relative to esp.
757d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
7589eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.comvoid ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
759f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
760510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    mov       eax, [esp + 4]   // src_argb
761510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    mov       edx, [esp + 8]   // dst_rgb
762510fe70cb5c59f51cb48d854aef5393f6c85307dfbarchard@google.com    mov       ecx, [esp + 12]  // pix
7639eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
7649eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    psllw     xmm4, 12
7659eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
7669eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    psrlw     xmm3, 8
7679eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
768c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
7699eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com convertloop:
7709eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm0, [eax]   // fetch 4 pixels of argb (aligned load)
7719eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movdqa    xmm1, xmm0
7729eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    pand      xmm0, xmm3    // high nibble of each even byte (becomes the output low nibble)
7739eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    pand      xmm1, xmm4    // high nibble of each odd byte (becomes the output high nibble)
    // psrlw replaces the non-standard mnemonic "psrl". The masks above leave
    // zeros between fields, so no bits cross a 16 bit lane during the shift.
7749eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    psrlw     xmm0, 4       // move to bits 0..3 of each word
7759eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    psrlw     xmm1, 8       // move to bits 4..7 of each word
7769eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    por       xmm0, xmm1
7779eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    packuswb  xmm0, xmm0    // every word is <= 0xff, so the unsigned pack does not saturate
77824d2656b65beb2a86acf1913a5c025a6aca21299fbarchard@google.com    lea       eax, [eax + 16]
7799eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
7809eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    lea       edx, [edx + 8]
7819eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    sub       ecx, 4
78218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
7839eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com    ret
7849eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com  }
7859eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com}
7869eefb2e8dd2c40a8b6bd0f02d794fe78332fc08ffbarchard@google.com
787c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop iteration.
//   src_argb: source pixels; loaded with movdqa, so it must be 16-byte aligned.
//   dst_y:    destination; stored with movdqa, so it must be 16-byte aligned.
//   pix:      pixel count. The loop subtracts 16 per pass and repeats while the
//             remainder is > 0, so the body always executes at least once.
// kARGBToY and kAddY16 are constants defined elsewhere in this file.
// Naked function: arguments are read directly from the stack relative to esp.
788d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
789585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.comvoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
790f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
7919394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
7929394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
7939394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
7946334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm5, kAddY16    // byte offset added after packing
7956334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm4, kARGBToY   // per-channel signed byte coefficients
796585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
797c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
798eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
7999394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm0, [eax]
8009394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, [eax + 16]
8019394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm2, [eax + 32]
8029394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, [eax + 48]
803b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes * signed coefficients, byte pairs summed to words
804b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm1, xmm4
805b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm2, xmm4
806b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm3, xmm4
8079394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        eax, [eax + 64]
8089394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm0, xmm1       // add the two words of each pixel: 8 sums per register
8099394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm2, xmm3
8109394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psrlw      xmm0, 7          // drop 7 low bits (coefficients presumably scaled by 128)
8119394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psrlw      xmm2, 7
8129394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    packuswb   xmm0, xmm2       // 16 words -> 16 bytes
813b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    paddb      xmm0, xmm5       // add kAddY16 to every byte
814aa4750f86da4747c8a3d1488cd25c49c434fbe65fbarchard@google.com    sub        ecx, 16          // sets the flags tested by jg; movdqa/lea below leave them intact
8159394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     [edx], xmm0
8169394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        edx, [edx + 16]
81718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
8189394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    ret
8199394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  }
8209394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com}
8219394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
822cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop iteration.
// Same structure as ARGBToYRow_SSSE3 but uses the kARGBToYJ coefficients, adds
// kAddYJ64 to the word sums before the shift, and adds no offset after packing.
//   src_argb: source pixels; loaded with movdqa, so it must be 16-byte aligned.
//   dst_y:    destination; stored with movdqa, so it must be 16-byte aligned.
//   pix:      pixel count. The loop subtracts 16 per pass and repeats while the
//             remainder is > 0, so the body always executes at least once.
// kARGBToYJ and kAddYJ64 are constants defined elsewhere in this file.
823cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com__declspec(naked) __declspec(align(16))
824cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.comvoid ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
825cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com  __asm {
826cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
827cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
828cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
8294e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    movdqa     xmm4, kARGBToYJ  // per-channel signed byte coefficients
8304e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    movdqa     xmm5, kAddYJ64   // word rounding term added before the shift
831cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com
832c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
833cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com convertloop:
834cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqa     xmm0, [eax]
835cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqa     xmm1, [eax + 16]
836cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqa     xmm2, [eax + 32]
837cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqa     xmm3, [eax + 48]
838cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes * signed coefficients, byte pairs summed to words
839cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm1, xmm4
840cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm2, xmm4
841cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm3, xmm4
842cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    lea        eax, [eax + 64]
843cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    phaddw     xmm0, xmm1       // add the two words of each pixel: 8 sums per register
844cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    phaddw     xmm2, xmm3
8458c9de166a11222d6aa38deb12449b30451d2eca5fbarchard@google.com    paddw      xmm0, xmm5  // Add .5 for rounding.
8464e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    paddw      xmm2, xmm5
847cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    psrlw      xmm0, 7          // drop 7 low bits
848cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    psrlw      xmm2, 7
849cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    packuswb   xmm0, xmm2       // 16 words -> 16 bytes
850cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    sub        ecx, 16          // sets the flags tested by jg; movdqa/lea below leave them intact
851cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqa     [edx], xmm0
852cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    lea        edx, [edx + 16]
853cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    jg         convertloop
854cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    ret
855cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com  }
856cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com}
857cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com
858551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com#ifdef HAS_ARGBTOYROW_AVX2
859551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop iteration.
//   src_argb: source pixels; loaded with vmovdqu (no alignment requirement).
//   dst_y:    destination; stored with vmovdqu (no alignment requirement).
//   pix:      pixel count. The loop subtracts 32 per pass and repeats while the
//             remainder is > 0, so the body always executes at least once.
// kARGBToY, kAddY16 and kPermdARGBToY_AVX are constants defined elsewhere in
// this file. Naked function: arguments are read from the stack relative to esp.
860551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com__declspec(naked) __declspec(align(32))
861551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.comvoid ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
862551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com  __asm {
863551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
864551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
865551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
866446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm4, kARGBToY  // copy the 128 bit constant into both lanes
867446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm5, kAddY16
868446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vmovdqa    ymm6, kPermdARGBToY_AVX  // dword indices used by vpermd below
869551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
870c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
871551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com convertloop:
872b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]
873b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
874b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm2, [eax + 64]
875b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm3, [eax + 96]
876551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpmaddubsw ymm0, ymm0, ymm4  // unsigned pixel bytes * signed coefficients, byte pairs summed to words
877551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpmaddubsw ymm1, ymm1, ymm4
878551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpmaddubsw ymm2, ymm2, ymm4
879551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpmaddubsw ymm3, ymm3, ymm4
880551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    lea        eax, [eax + 128]
881caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vphaddw    ymm0, ymm0, ymm1  // mutates: works per 128 bit lane, so pixel order is shuffled.
882551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vphaddw    ymm2, ymm2, ymm3
883551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpsrlw     ymm0, ymm0, 7     // drop 7 low bits (coefficients presumably scaled by 128)
884551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpsrlw     ymm2, ymm2, 7
885caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm2  // mutates: also per 128 bit lane.
886551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation: reorder dwords.
887551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    vpaddb     ymm0, ymm0, ymm5  // add kAddY16 to every byte
888551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    sub        ecx, 32           // sets the flags tested by jg; vmovdqu/lea below leave them intact
889b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edx], ymm0
890551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    lea        edx, [edx + 32]
891551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    jg         convertloop
8929b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper                   // clear upper ymm halves before returning to non-AVX code
893551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    ret
894551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com  }
895551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com}
896551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com#endif  //  HAS_ARGBTOYROW_AVX2
897551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
// NOTE(review): this YJ variant is guarded by HAS_ARGBTOYROW_AVX2, the same
// macro as ARGBToYRow_AVX2; confirm whether a YJ-specific macro was intended.
89891c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com#ifdef HAS_ARGBTOYROW_AVX2
89991c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop iteration.
// Same structure as ARGBToYRow_AVX2 but uses the kARGBToYJ coefficients, adds
// kAddYJ64 to the word sums before the shift, and adds no offset after packing.
//   src_argb: source pixels; loaded with vmovdqu (no alignment requirement).
//   dst_y:    destination; stored with vmovdqu (no alignment requirement).
//   pix:      pixel count. The loop subtracts 32 per pass and repeats while the
//             remainder is > 0, so the body always executes at least once.
90091c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com__declspec(naked) __declspec(align(32))
90191c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.comvoid ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
90291c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com  __asm {
90391c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
90491c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
90591c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
906446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm4, kARGBToYJ  // copy the 128 bit constant into both lanes
907446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm5, kAddYJ64
908446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vmovdqa    ymm6, kPermdARGBToY_AVX  // dword indices used by vpermd below
90991c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com
910c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
91191c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com convertloop:
91291c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vmovdqu    ymm0, [eax]
91391c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vmovdqu    ymm1, [eax + 32]
91491c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vmovdqu    ymm2, [eax + 64]
91591c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vmovdqu    ymm3, [eax + 96]
91691c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpmaddubsw ymm0, ymm0, ymm4  // unsigned pixel bytes * signed coefficients, byte pairs summed to words
91791c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpmaddubsw ymm1, ymm1, ymm4
91891c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpmaddubsw ymm2, ymm2, ymm4
91991c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpmaddubsw ymm3, ymm3, ymm4
92091c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    lea        eax, [eax + 128]
92191c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vphaddw    ymm0, ymm0, ymm1  // mutates: works per 128 bit lane, so pixel order is shuffled.
92291c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vphaddw    ymm2, ymm2, ymm3
92391c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
92491c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpaddw     ymm2, ymm2, ymm5
92591c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpsrlw     ymm0, ymm0, 7     // drop 7 low bits
92691c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpsrlw     ymm2, ymm2, 7
92791c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpackuswb  ymm0, ymm0, ymm2  // mutates: also per 128 bit lane.
92891c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation: reorder dwords.
92991c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    sub        ecx, 32           // sets the flags tested by jg; vmovdqu/lea below leave them intact
93091c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vmovdqu    [edx], ymm0
93191c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    lea        edx, [edx + 32]
93291c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    jg         convertloop
93391c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com
93491c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    vzeroupper                   // clear upper ymm halves before returning to non-AVX code
93591c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com    ret
93691c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com  }
93791c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com}
93891c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com#endif  //  HAS_ARGBTOYROW_AVX2
93991c50c3a7d8736aa5834d6c54ae1c6bbea581e1ffbarchard@google.com
// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop iteration.
// Same arithmetic as ARGBToYRow_SSSE3, but pixel loads and the store use
// movdqu, so src_argb and dst_y have no alignment requirement.
//   pix: pixel count. The loop subtracts 16 per pass and repeats while the
//        remainder is > 0, so the body always executes at least once.
// kARGBToY and kAddY16 are constants defined elsewhere in this file.
940d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
941b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.comvoid ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
942f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
943b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
944b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
945b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
946b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm5, kAddY16    // byte offset added after packing
947b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, kARGBToY   // per-channel signed byte coefficients
948b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
949c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
950b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com convertloop:
951b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm0, [eax]
952b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm1, [eax + 16]
953b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm2, [eax + 32]
954b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm3, [eax + 48]
955b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes * signed coefficients, byte pairs summed to words
956b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm1, xmm4
957b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm2, xmm4
958b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm3, xmm4
959b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        eax, [eax + 64]
960b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm0, xmm1       // add the two words of each pixel: 8 sums per register
961b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm2, xmm3
962b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psrlw      xmm0, 7          // drop 7 low bits (coefficients presumably scaled by 128)
963b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psrlw      xmm2, 7
964b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    packuswb   xmm0, xmm2       // 16 words -> 16 bytes
965b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    paddb      xmm0, xmm5       // add kAddY16 to every byte
966aa4750f86da4747c8a3d1488cd25c49c434fbe65fbarchard@google.com    sub        ecx, 16          // sets the flags tested by jg; movdqu/lea below leave them intact
967b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     [edx], xmm0
968b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        edx, [edx + 16]
96918184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
970b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    ret
971b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com  }
972b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com}
973b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop iteration.
// Same arithmetic as ARGBToYJRow_SSSE3, but pixel loads and the store use
// movdqu, so src_argb and dst_y have no alignment requirement.
//   pix: pixel count. The loop subtracts 16 per pass and repeats while the
//        remainder is > 0, so the body always executes at least once.
// kARGBToYJ and kAddYJ64 are constants defined elsewhere in this file.
974d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
975cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.comvoid ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
976cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com  __asm {
977cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
978cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
979cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
9804e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    movdqa     xmm4, kARGBToYJ  // per-channel signed byte coefficients
9814e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    movdqa     xmm5, kAddYJ64   // word rounding term added before the shift
982cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com
983c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
984cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com convertloop:
985cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqu     xmm0, [eax]
986cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqu     xmm1, [eax + 16]
987cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqu     xmm2, [eax + 32]
988cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqu     xmm3, [eax + 48]
989cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes * signed coefficients, byte pairs summed to words
990cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm1, xmm4
991cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm2, xmm4
992cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    pmaddubsw  xmm3, xmm4
993cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    lea        eax, [eax + 64]
994cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    phaddw     xmm0, xmm1       // add the two words of each pixel: 8 sums per register
995cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    phaddw     xmm2, xmm3
9964e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    paddw      xmm0, xmm5       // Add .5 for rounding.
9974e0d7cc2c60e8aa85954c48927c6be08ee2b9db4fbarchard@google.com    paddw      xmm2, xmm5
998cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    psrlw      xmm0, 7          // drop 7 low bits
999cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    psrlw      xmm2, 7
1000cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    packuswb   xmm0, xmm2       // 16 words -> 16 bytes
1001cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    sub        ecx, 16          // sets the flags tested by jg; movdqu/lea below leave them intact
1002cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    movdqu     [edx], xmm0
1003cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    lea        edx, [edx + 16]
1004cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    jg         convertloop
1005cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com    ret
1006cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com  }
1007cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com}
1008cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com
// Convert 16 BGRA pixels (64 bytes) to 16 Y values per loop iteration.
// Same instruction sequence as ARGBToYRow_SSSE3 but with the kBGRAToY
// coefficients. The source parameter is named src_argb even though this
// function takes its coefficients from kBGRAToY.
//   src_argb: source pixels; loaded with movdqa, so it must be 16-byte aligned.
//   dst_y:    destination; stored with movdqa, so it must be 16-byte aligned.
//   pix:      pixel count. The loop subtracts 16 per pass and repeats while the
//             remainder is > 0, so the body always executes at least once.
// kBGRAToY and kAddY16 are constants defined elsewhere in this file.
1009cfaa66c04154eec91951e2596cfe55eb6f2f749efbarchard@google.com__declspec(naked) __declspec(align(16))
10109394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.comvoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1011f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
10129394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
10139394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
10149394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
10156334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm5, kAddY16    // byte offset added after packing
10166334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm4, kBGRAToY   // per-channel signed byte coefficients
10179394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
1018c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1019eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
10209394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm0, [eax]
10219394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, [eax + 16]
10229394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm2, [eax + 32]
10239394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, [eax + 48]
1024b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm0, xmm4       // unsigned pixel bytes * signed coefficients, byte pairs summed to words
1025b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm1, xmm4
1026b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm2, xmm4
1027b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm3, xmm4
10289394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        eax, [eax + 64]
10299394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm0, xmm1       // add the two words of each pixel: 8 sums per register
10309394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm2, xmm3
10319394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psrlw      xmm0, 7          // drop 7 low bits (coefficients presumably scaled by 128)
10329394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psrlw      xmm2, 7
10339394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    packuswb   xmm0, xmm2       // 16 words -> 16 bytes
1034b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    paddb      xmm0, xmm5       // add kAddY16 to every byte
1035aa4750f86da4747c8a3d1488cd25c49c434fbe65fbarchard@google.com    sub        ecx, 16          // sets the flags tested by jg; movdqa/lea below leave them intact
10369394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     [edx], xmm0
10379394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        edx, [edx + 16]
103818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
10399394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    ret
10409394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  }
10419394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com}
10429394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
// Converts a row of 4-byte-per-pixel BGRA data to 8-bit Y, 16 pixels per loop
// iteration, using SSSE3. Source and destination are accessed with movdqu, so
// neither pointer needs any particular alignment.
//   src_argb: source pixels.
//   dst_y:    destination bytes, one per pixel.
//   pix:      pixel count. The loop body runs before the count is tested and
//             handles 16 pixels per pass, so at least 16 pixels are always
//             processed and the count is effectively rounded up to a multiple
//             of 16.
// The function is naked: there is no compiler-generated prologue, so the
// arguments are fetched directly from the stack relative to esp.
1043d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
1044b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.comvoid BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1045f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
1046b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
1047b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
1048b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
1049b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm5, kAddY16   // per-byte constant added to every result (defined elsewhere)
1050b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, kBGRAToY  // signed byte multipliers (defined elsewhere; presumably luma weights in BGRA byte order - confirm)
1051b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1052c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1053b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com convertloop:
1054b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm0, [eax]       // load 64 source bytes = 16 pixels (unaligned loads)
1055b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm1, [eax + 16]
1056b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm2, [eax + 32]
1057b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm3, [eax + 48]
1058b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm0, xmm4        // unsigned pixel byte * signed multiplier, adjacent pairs summed into words
1059b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm1, xmm4
1060b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm2, xmm4
1061b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm3, xmm4
1062b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        eax, [eax + 64]   // advance source by 16 pixels
1063b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm0, xmm1        // add the two words of each pixel: one word per pixel, pixels 0-7
1064b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm2, xmm3        // pixels 8-15
1065b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psrlw      xmm0, 7           // logical shift right by 7 bits
1066b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psrlw      xmm2, 7
1067b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    packuswb   xmm0, xmm2        // 16 words -> 16 bytes with unsigned saturation
1068b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    paddb      xmm0, xmm5        // add kAddY16 to each byte (wrapping add)
1069aa4750f86da4747c8a3d1488cd25c49c434fbe65fbarchard@google.com    sub        ecx, 16           // 16 pixels consumed; sets the flags tested by jg below
1070b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     [edx], xmm0       // store 16 Y bytes (unaligned store; does not modify flags)
1071b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        edx, [edx + 16]   // advance destination (lea does not modify flags)
107218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop       // repeat while the remaining count is > 0
1073b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    ret
1074b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com  }
1075b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com}
1076b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
// Converts a row of 4-byte-per-pixel ABGR data to 8-bit Y, 16 pixels per loop
// iteration, using SSSE3 (pmaddubsw / phaddw).
//   src_argb: source pixels. Read with movdqa, which faults unless the address
//             is 16-byte aligned.
//   dst_y:    destination bytes. Written with movdqa, so it must also be
//             16-byte aligned.
//   pix:      pixel count. The loop body runs before the count is tested and
//             handles 16 pixels per pass, so at least 16 pixels are always
//             processed and the count is effectively rounded up to a multiple
//             of 16.
// The function is naked: there is no compiler-generated prologue, so the
// arguments are fetched directly from the stack relative to esp.
1077d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
10789394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.comvoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1079f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
10809394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
10819394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
10829394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
10836334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm5, kAddY16   // per-byte constant added to every result (defined elsewhere)
10846334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm4, kABGRToY  // signed byte multipliers (defined elsewhere; presumably luma weights in ABGR byte order - confirm)
10859394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
1086c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1087eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
10889394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm0, [eax]       // load 64 source bytes = 16 pixels
10899394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, [eax + 16]
10909394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm2, [eax + 32]
10919394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, [eax + 48]
1092b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm0, xmm4        // unsigned pixel byte * signed multiplier, adjacent pairs summed into words
1093b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm1, xmm4
1094b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm2, xmm4
1095b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    pmaddubsw  xmm3, xmm4
10969394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        eax, [eax + 64]   // advance source by 16 pixels
10979394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm0, xmm1        // add the two words of each pixel: one word per pixel, pixels 0-7
10989394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm2, xmm3        // pixels 8-15
10999394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psrlw      xmm0, 7           // logical shift right by 7 bits
11009394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psrlw      xmm2, 7
11019394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    packuswb   xmm0, xmm2        // 16 words -> 16 bytes with unsigned saturation
1102b61497636a648c771ac55d184a80b17aca7414f5fbarchard@google.com    paddb      xmm0, xmm5        // add kAddY16 to each byte (wrapping add)
1103aa4750f86da4747c8a3d1488cd25c49c434fbe65fbarchard@google.com    sub        ecx, 16           // 16 pixels consumed; sets the flags tested by jg below
11049394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     [edx], xmm0       // store 16 Y bytes (does not modify flags)
11059394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        edx, [edx + 16]   // advance destination (lea does not modify flags)
110618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop       // repeat while the remaining count is > 0
1107585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    ret
1108585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com  }
1109585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com}
1110585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
// Converts a row of 4-byte-per-pixel ABGR data to 8-bit Y, 16 pixels per loop
// iteration, using SSSE3. Source and destination are accessed with movdqu, so
// neither pointer needs any particular alignment.
//   src_argb: source pixels.
//   dst_y:    destination bytes, one per pixel.
//   pix:      pixel count. The loop body runs before the count is tested and
//             handles 16 pixels per pass, so at least 16 pixels are always
//             processed and the count is effectively rounded up to a multiple
//             of 16.
// The function is naked: there is no compiler-generated prologue, so the
// arguments are fetched directly from the stack relative to esp.
1111d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
1112b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.comvoid ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1113f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
1114b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
1115b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
1116b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
1117b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm5, kAddY16   // per-byte constant added to every result (defined elsewhere)
1118b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, kABGRToY  // signed byte multipliers (defined elsewhere; presumably luma weights in ABGR byte order - confirm)
1119b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1120c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1121b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com convertloop:
1122b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm0, [eax]       // load 64 source bytes = 16 pixels (unaligned loads)
1123b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm1, [eax + 16]
1124b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm2, [eax + 32]
1125b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm3, [eax + 48]
1126b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm0, xmm4        // unsigned pixel byte * signed multiplier, adjacent pairs summed into words
1127b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm1, xmm4
1128b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm2, xmm4
1129b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm3, xmm4
1130b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        eax, [eax + 64]   // advance source by 16 pixels
1131b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm0, xmm1        // add the two words of each pixel: one word per pixel, pixels 0-7
1132b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm2, xmm3        // pixels 8-15
1133b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psrlw      xmm0, 7           // logical shift right by 7 bits
1134b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psrlw      xmm2, 7
1135b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    packuswb   xmm0, xmm2        // 16 words -> 16 bytes with unsigned saturation
1136b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    paddb      xmm0, xmm5        // add kAddY16 to each byte (wrapping add)
113718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16           // 16 pixels consumed; sets the flags tested by jg below
1138b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     [edx], xmm0       // store 16 Y bytes (unaligned store; does not modify flags)
1139b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        edx, [edx + 16]   // advance destination (lea does not modify flags)
114018184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop       // repeat while the remaining count is > 0
1141b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    ret
1142b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com  }
1143b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com}
1144b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
// Converts a row of 4-byte-per-pixel RGBA data to 8-bit Y, 16 pixels per loop
// iteration, using SSSE3 (pmaddubsw / phaddw).
//   src_argb: source pixels. Read with movdqa, which faults unless the address
//             is 16-byte aligned.
//   dst_y:    destination bytes. Written with movdqa, so it must also be
//             16-byte aligned.
//   pix:      pixel count. The loop body runs before the count is tested and
//             handles 16 pixels per pass, so at least 16 pixels are always
//             processed and the count is effectively rounded up to a multiple
//             of 16.
// The function is naked: there is no compiler-generated prologue, so the
// arguments are fetched directly from the stack relative to esp.
1145d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
114625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1147f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
114825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
114925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
115025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
115125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm5, kAddY16   // per-byte constant added to every result (defined elsewhere)
115225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm4, kRGBAToY  // signed byte multipliers (defined elsewhere; presumably luma weights in RGBA byte order - confirm)
115325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
1154c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
115525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com convertloop:
115625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm0, [eax]       // load 64 source bytes = 16 pixels
115725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm1, [eax + 16]
115825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm2, [eax + 32]
115925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm3, [eax + 48]
116025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm0, xmm4        // unsigned pixel byte * signed multiplier, adjacent pairs summed into words
116125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm1, xmm4
116225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm2, xmm4
116325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm3, xmm4
116425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        eax, [eax + 64]   // advance source by 16 pixels
116525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm0, xmm1        // add the two words of each pixel: one word per pixel, pixels 0-7
116625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm2, xmm3        // pixels 8-15
116725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psrlw      xmm0, 7           // logical shift right by 7 bits
116825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psrlw      xmm2, 7
116925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    packuswb   xmm0, xmm2        // 16 words -> 16 bytes with unsigned saturation
117025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    paddb      xmm0, xmm5        // add kAddY16 to each byte (wrapping add)
1171aa4750f86da4747c8a3d1488cd25c49c434fbe65fbarchard@google.com    sub        ecx, 16           // 16 pixels consumed; sets the flags tested by jg below
117225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     [edx], xmm0       // store 16 Y bytes (does not modify flags)
117325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        edx, [edx + 16]   // advance destination (lea does not modify flags)
117425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    jg         convertloop       // repeat while the remaining count is > 0
117525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    ret
117625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  }
117725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com}
117825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
// Converts a row of 4-byte-per-pixel RGBA data to 8-bit Y, 16 pixels per loop
// iteration, using SSSE3. Source and destination are accessed with movdqu, so
// neither pointer needs any particular alignment.
//   src_argb: source pixels.
//   dst_y:    destination bytes, one per pixel.
//   pix:      pixel count. The loop body runs before the count is tested and
//             handles 16 pixels per pass, so at least 16 pixels are always
//             processed and the count is effectively rounded up to a multiple
//             of 16.
// The function is naked: there is no compiler-generated prologue, so the
// arguments are fetched directly from the stack relative to esp.
117925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com__declspec(naked) __declspec(align(16))
118025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1181f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
118225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
118325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8]   /* dst_y */
118425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        ecx, [esp + 12]  /* pix */
118525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm5, kAddY16   // per-byte constant added to every result (defined elsewhere)
118625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm4, kRGBAToY  // signed byte multipliers (defined elsewhere; presumably luma weights in RGBA byte order - confirm)
118725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
1188c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
118925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com convertloop:
119025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm0, [eax]       // load 64 source bytes = 16 pixels (unaligned loads)
119125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm1, [eax + 16]
119225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm2, [eax + 32]
119325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm3, [eax + 48]
119425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm0, xmm4        // unsigned pixel byte * signed multiplier, adjacent pairs summed into words
119525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm1, xmm4
119625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm2, xmm4
119725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm3, xmm4
119825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        eax, [eax + 64]   // advance source by 16 pixels
119925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm0, xmm1        // add the two words of each pixel: one word per pixel, pixels 0-7
120025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm2, xmm3        // pixels 8-15
120125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psrlw      xmm0, 7           // logical shift right by 7 bits
120225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psrlw      xmm2, 7
120325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    packuswb   xmm0, xmm2        // 16 words -> 16 bytes with unsigned saturation
120425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    paddb      xmm0, xmm5        // add kAddY16 to each byte (wrapping add)
120525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        ecx, 16           // 16 pixels consumed; sets the flags tested by jg below
120625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx], xmm0       // store 16 Y bytes (unaligned store; does not modify flags)
120725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        edx, [edx + 16]   // advance destination (lea does not modify flags)
120825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    jg         convertloop       // repeat while the remaining count is > 0
120925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    ret
121025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  }
121125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com}
121225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
// Produces one row of U and one row of V from two rows of 4-byte-per-pixel
// ARGB data, averaging each 2x2 block of source pixels first. Each loop
// iteration consumes 16 source pixels from each of the two rows and writes
// 8 U bytes and 8 V bytes, using SSSE3.
//   src_argb0:       first source row. The second row is read at
//                    src_argb0 + src_stride_argb. Both rows are accessed with
//                    movdqa or a memory-operand pavgb, which fault unless the
//                    address is 16-byte aligned.
//   src_stride_argb: byte offset from the first source row to the second.
//   dst_u, dst_v:    destinations; written 8 bytes at a time with
//                    movlps / movhps.
//   width:           source pixel count. The loop body runs before the count
//                    is tested and handles 16 pixels per pass, so at least one
//                    pass always executes and the count is effectively rounded
//                    up to a multiple of 16.
// The function is naked: there is no compiler-generated prologue. esi and edi
// are saved and restored by hand, and the "+ 8" in the argument offsets
// accounts for those two pushes.
121325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com__declspec(naked) __declspec(align(16))
1214585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.comvoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1215585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com                       uint8* dst_u, uint8* dst_v, int width) {
1216f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
1217585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    push       esi
1218585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    push       edi
1219585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0
1220585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1221585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1222585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
1223585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
12246334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm7, kARGBToU       // signed byte multipliers for U (defined elsewhere)
12256334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm6, kARGBToV       // signed byte multipliers for V (defined elsewhere)
12266334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm5, kAddUV128      // per-byte constant added after packing (defined elsewhere)
12279394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V while only edx advances
1228585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
1229c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1230eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
12319394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    /* step 1 - subsample 16x2 argb pixels to 8x1 */
12329394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm0, [eax]              // 16 pixels of the first row
1233585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    movdqa     xmm1, [eax + 16]
12349394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm2, [eax + 32]
12359394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, [eax + 48]
12369394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm0, [eax + esi]        // rounded byte average with the second row (vertical)
12379394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm1, [eax + esi + 16]
12389394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm2, [eax + esi + 32]
12399394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm3, [eax + esi + 48]
12409394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        eax,  [eax + 64]         // advance source by 16 pixels
12419394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm4, xmm0
1242585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    shufps     xmm0, xmm1, 0x88         // dwords 0,2 of each register: even-numbered pixels
12439394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm4, xmm1, 0xdd         // dwords 1,3 of each register: odd-numbered pixels
12449394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm0, xmm4               // average horizontal neighbours: 4 subsampled pixels
12459394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm4, xmm2
12469394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm2, xmm3, 0x88
12479394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
12489394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm2, xmm4               // the other 4 subsampled pixels
1249585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
1250585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    // step 2 - convert to U and V
1251585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    // from here down is very similar to Y code except
12529394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1253585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    movdqa     xmm1, xmm0               // copies kept for the V computation
12549394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, xmm2
12559394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
12569394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm2, xmm7
12579394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
12589394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm3, xmm6
12599394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm0, xmm2               // one signed word per subsampled pixel: 8 U values
12609394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm1, xmm3               // 8 V values
12619394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psraw      xmm0, 8                  // arithmetic shift right by 8 keeps the sign
12629394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psraw      xmm1, 8
12639394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    packsswb   xmm0, xmm1               // signed-saturating pack: low 8 bytes = U, high 8 bytes = V
12649394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    paddb      xmm0, xmm5            // -> unsigned
12659394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
12669394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // step 3 - store 8 U and 8 V values
126718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16                  // 16 source pixels consumed; sets the flags tested by jg below
12689394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U
12699394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V
12709394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        edx, [edx + 8]           // advance U (and, via edi, V) by 8 bytes; stores and lea leave flags intact
127118184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop              // repeat while the remaining count is > 0
127218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
1273585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    pop        edi
1274585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    pop        esi
1275585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com    ret
1276585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com  }
1277585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com}
1278585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
// Produces one row of U and one row of V from two rows of 4-byte-per-pixel
// ARGB data using the "J" constants (kARGBToUJ / kARGBToVJ / kAddUVJ128),
// averaging each 2x2 block of source pixels first. Each loop iteration
// consumes 16 source pixels from each of the two rows and writes 8 U bytes
// and 8 V bytes, using SSSE3. In this function the bias constant is added to
// the 16-bit sums before the shift; nothing is added after packing.
//   src_argb0:       first source row. The second row is read at
//                    src_argb0 + src_stride_argb. Both rows are accessed with
//                    movdqa or a memory-operand pavgb, which fault unless the
//                    address is 16-byte aligned.
//   src_stride_argb: byte offset from the first source row to the second.
//   dst_u, dst_v:    destinations; written 8 bytes at a time with
//                    movlps / movhps.
//   width:           source pixel count. The loop body runs before the count
//                    is tested and handles 16 pixels per pass, so at least one
//                    pass always executes and the count is effectively rounded
//                    up to a multiple of 16.
// The function is naked: there is no compiler-generated prologue. esi and edi
// are saved and restored by hand, and the "+ 8" in the argument offsets
// accounts for those two pushes.
1279050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com__declspec(naked) __declspec(align(16))
1280050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.comvoid ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1281050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com                        uint8* dst_u, uint8* dst_v, int width) {
1282050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  __asm {
1283050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    push       esi
1284050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    push       edi
1285050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0
1286050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1287050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1288050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
1289050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
1290050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm7, kARGBToUJ      // signed byte multipliers for U (defined elsewhere)
1291050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm6, kARGBToVJ      // signed byte multipliers for V (defined elsewhere)
1292050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm5, kAddUVJ128     // per-word constant added before the shift (defined elsewhere)
1293050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V while only edx advances
1294050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1295c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1296050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com convertloop:
1297050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1298050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm0, [eax]              // 16 pixels of the first row
1299050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm1, [eax + 16]
1300050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm2, [eax + 32]
1301050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm3, [eax + 48]
1302050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm0, [eax + esi]        // rounded byte average with the second row (vertical)
1303050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm1, [eax + esi + 16]
1304050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm2, [eax + esi + 32]
1305050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm3, [eax + esi + 48]
1306050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    lea        eax,  [eax + 64]         // advance source by 16 pixels
1307050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm4, xmm0
1308050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm0, xmm1, 0x88         // dwords 0,2 of each register: even-numbered pixels
1309050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm4, xmm1, 0xdd         // dwords 1,3 of each register: odd-numbered pixels
1310050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm0, xmm4               // average horizontal neighbours: 4 subsampled pixels
1311050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm4, xmm2
1312050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm2, xmm3, 0x88
1313050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
1314050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm2, xmm4               // the other 4 subsampled pixels
1315050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1316050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // step 2 - convert to U and V
1317050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // from here down is very similar to Y code except
1318050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1319050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm1, xmm0               // copies kept for the V computation
1320050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm3, xmm2
1321050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
1322050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm2, xmm7
1323050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
1324050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm3, xmm6
1325050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    phaddw     xmm0, xmm2               // one signed word per subsampled pixel: 8 U values
1326050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    phaddw     xmm1, xmm3               // 8 V values
1327050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
1328050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    paddw      xmm1, xmm5
1329050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    psraw      xmm0, 8                  // arithmetic shift right by 8
1330050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    psraw      xmm1, 8
1331050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    packsswb   xmm0, xmm1               // signed-saturating pack: low 8 bytes = U, high 8 bytes = V
1332050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1333050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // step 3 - store 8 U and 8 V values
1334050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    sub        ecx, 16                  // 16 source pixels consumed; sets the flags tested by jg below
1335050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U
1336050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V
1337050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    lea        edx, [edx + 8]           // advance U (and, via edi, V) by 8 bytes; stores and lea leave flags intact
1338050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    jg         convertloop              // repeat while the remaining count is > 0
1339050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1340050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pop        edi
1341050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pop        esi
1342050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    ret
1343050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  }
1344050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com}
1345050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1346551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com#ifdef HAS_ARGBTOUVROW_AVX2
1347551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com__declspec(naked) __declspec(align(32))
1348551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.comvoid ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1349551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com                      uint8* dst_u, uint8* dst_v, int width) {
1350551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com  __asm {
1351551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    push       esi
1352551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    push       edi
1353551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb
1354551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1355551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1356551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
1357551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // pix
1358446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm5, kAddUV128
1359446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm6, kARGBToV
1360446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm7, kARGBToU
1361551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    sub        edi, edx             // stride from u to v
1362551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
1363c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1364551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com convertloop:
1365b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    /* step 1 - subsample 32x2 argb pixels to 16x1 */
1366caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vmovdqu    ymm0, [eax]
1367caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
1368caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vmovdqu    ymm2, [eax + 64]
1369caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vmovdqu    ymm3, [eax + 96]
1370caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpavgb     ymm0, ymm0, [eax + esi]
1371caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpavgb     ymm1, ymm1, [eax + esi + 32]
1372caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpavgb     ymm2, ymm2, [eax + esi + 64]
1373caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpavgb     ymm3, ymm3, [eax + esi + 96]
1374caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    lea        eax,  [eax + 128]
1375caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vshufps    ymm4, ymm0, ymm1, 0x88
1376caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vshufps    ymm0, ymm0, ymm1, 0xdd
1377caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
1378caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vshufps    ymm4, ymm2, ymm3, 0x88
1379caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vshufps    ymm2, ymm2, ymm3, 0xdd
1380caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
1381551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
1382551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    // step 2 - convert to U and V
1383551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    // from here down is very similar to Y code except
1384551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    // instead of 32 different pixels, its 16 pixels of U and 16 of V
1385caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpmaddubsw ymm1, ymm0, ymm7  // U
1386caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpmaddubsw ymm3, ymm2, ymm7
1387caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpmaddubsw ymm0, ymm0, ymm6  // V
1388caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpmaddubsw ymm2, ymm2, ymm6
1389caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vphaddw    ymm1, ymm1, ymm3  // mutates
1390caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vphaddw    ymm0, ymm0, ymm2
1391caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpsraw     ymm1, ymm1, 8
1392caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpsraw     ymm0, ymm0, 8
1393caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpacksswb  ymm0, ymm1, ymm0  // mutates
1394caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
1395caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
1396caf6e2470b15fd4a8df03351b07683352226824cfbarchard@google.com    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
1397551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
1398551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    // step 3 - store 16 U and 16 V values
1399551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    sub         ecx, 32
1400b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx], ymm0, 0 // U
1401b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx + edi], ymm0, 1 // V
1402551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    lea        edx, [edx + 16]
1403551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    jg         convertloop
1404551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
1405551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    pop        edi
1406551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    pop        esi
14079b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
1408551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com    ret
1409551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com  }
1410551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com}
1411551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com#endif  // HAS_ARGBTOUVROW_AVX2
1412551d2b297e6a071fe58a8f2da2cb69cc0ec56ed8fbarchard@google.com
// Converts two rows of ARGB into one row of U and one row of V (SSSE3).
// The source is read with unaligned loads (movdqu).
// Each loop iteration reads 16 pixels (64 bytes) at src_argb0 and another 64
// bytes at src_argb0 + src_stride_argb, averages the two rows and then each
// horizontal pixel pair with pavgb, and stores 8 U bytes to dst_u and 8 V
// bytes to dst_v. kARGBToU, kARGBToV and kAddUV128 are defined elsewhere in
// this file.
// The loop body runs before width is tested and always handles a full 16
// pixels; presumably callers pass a positive multiple of 16 - TODO confirm.
1413d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
1414b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.comvoid ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1415b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com                                 uint8* dst_u, uint8* dst_v, int width) {
1416f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
    // Naked function: arguments are addressed relative to esp. The "8" skips
    // the two registers pushed here; the next "4" skips the return address.
1417b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    push       esi
1418b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    push       edi
1419b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0
1420b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1421b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1422b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
1423b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
1424b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm7, kARGBToU
1425b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm6, kARGBToV
1426b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm5, kAddUV128
1427b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
1428b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1429c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1430b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com convertloop:
1431b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1432b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm0, [eax]
1433b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm1, [eax + 16]
1434b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm2, [eax + 32]
1435b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm3, [eax + 48]
    // Vertical pass: average each register with the matching 16 bytes of the
    // row at eax + esi. xmm4 is only a scratch register for the second row.
1436b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi]
1437b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm0, xmm4
1438b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 16]
1439b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm1, xmm4
1440b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 32]
1441b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm2, xmm4
1442b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 48]
1443b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm3, xmm4
1444b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        eax,  [eax + 64]
    // Horizontal pass: shufps 0x88 gathers dwords 0 and 2 of each operand
    // (even pixels), 0xdd gathers dwords 1 and 3 (odd pixels); pavgb then
    // averages each even/odd pair, leaving 4 pixels per register.
1445b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, xmm0
1446b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm0, xmm1, 0x88
1447b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm4, xmm1, 0xdd
1448b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm0, xmm4
1449b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, xmm2
1450b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm2, xmm3, 0x88
1451b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
1452b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm2, xmm4
1453b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1454b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // step 2 - convert to U and V
1455b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // from here down is very similar to Y code except
1456b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent products (two words per pixel); phaddw adds those
    // two words, giving one 16-bit value per pixel: 8 U in xmm0, 8 V in xmm1.
1457b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm1, xmm0
1458b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm3, xmm2
1459b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
1460b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm2, xmm7
1461b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
1462b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm3, xmm6
1463b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm0, xmm2
1464b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm1, xmm3
1465b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psraw      xmm0, 8
1466b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psraw      xmm1, 8
    // Pack to signed bytes: U lands in the low 8 bytes, V in the high 8 bytes.
1467b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    packsswb   xmm0, xmm1
1468b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 bias -> unsigned
1469b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1470b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // step 3 - store 8 U and 8 V values
    // The flags set by this sub are the ones jg tests below; the SSE stores
    // and lea in between do not modify EFLAGS, so do not insert
    // flag-writing instructions between them.
147118184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16
1472b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1473b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1474b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        edx, [edx + 8]
147518184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
147618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
1477b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pop        edi
1478b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pop        esi
1479b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    ret
1480b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com  }
1481b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com}
1482b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
// "J" variant of the two-row ARGB to U/V conversion (SSSE3, unaligned source
// loads). It uses the kARGBToUJ / kARGBToVJ coefficients and adds kAddUVJ128
// to the 16-bit sums before the shift by 8; no byte bias is added after
// packing. All three constants are defined elsewhere in this file.
// Each loop iteration reads 16 pixels (64 bytes) at src_argb0 and another 64
// bytes at src_argb0 + src_stride_argb, averages the two rows and then each
// horizontal pixel pair with pavgb, and stores 8 U bytes to dst_u and 8 V
// bytes to dst_v.
// The loop body runs before width is tested and always handles a full 16
// pixels; presumably callers pass a positive multiple of 16 - TODO confirm.
1483d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
1484050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.comvoid ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1485050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com                                 uint8* dst_u, uint8* dst_v, int width) {
1486050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  __asm {
    // Naked function: arguments are addressed relative to esp. The "8" skips
    // the two registers pushed here; the next "4" skips the return address.
1487050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    push       esi
1488050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    push       edi
1489050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0
1490050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1491050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1492050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
1493050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
1494050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm7, kARGBToUJ
1495050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm6, kARGBToVJ
1496050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm5, kAddUVJ128
1497050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
1498050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1499c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1500050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com convertloop:
1501050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1502050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm0, [eax]
1503050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm1, [eax + 16]
1504050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm2, [eax + 32]
1505050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm3, [eax + 48]
    // Vertical pass: average each register with the matching 16 bytes of the
    // row at eax + esi. xmm4 is only a scratch register for the second row.
1506050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm4, [eax + esi]
1507050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm0, xmm4
1508050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm4, [eax + esi + 16]
1509050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm1, xmm4
1510050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm4, [eax + esi + 32]
1511050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm2, xmm4
1512050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqu     xmm4, [eax + esi + 48]
1513050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm3, xmm4
1514050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    lea        eax,  [eax + 64]
    // Horizontal pass: shufps 0x88 gathers dwords 0 and 2 of each operand
    // (even pixels), 0xdd gathers dwords 1 and 3 (odd pixels); pavgb then
    // averages each even/odd pair, leaving 4 pixels per register.
1515050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm4, xmm0
1516050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm0, xmm1, 0x88
1517050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm4, xmm1, 0xdd
1518050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm0, xmm4
1519050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm4, xmm2
1520050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm2, xmm3, 0x88
1521050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
1522050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pavgb      xmm2, xmm4
1523050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1524050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // step 2 - convert to U and V
1525050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // from here down is very similar to Y code except
1526050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent products (two words per pixel); phaddw adds those
    // two words, giving one 16-bit value per pixel: 8 U in xmm0, 8 V in xmm1.
1527050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm1, xmm0
1528050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm3, xmm2
1529050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
1530050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm2, xmm7
1531050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
1532050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pmaddubsw  xmm3, xmm6
1533050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    phaddw     xmm0, xmm2
1534050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    phaddw     xmm1, xmm3
1535050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    paddw      xmm0, xmm5            // +.5 rounding -> unsigned (added before the shift)
1536050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    paddw      xmm1, xmm5
1537050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    psraw      xmm0, 8
1538050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    psraw      xmm1, 8
    // Pack to bytes: U lands in the low 8 bytes, V in the high 8 bytes.
1539050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    packsswb   xmm0, xmm1
1540050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1541050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    // step 3 - store 8 U and 8 V values
    // The flags set by this sub are the ones jg tests below; the SSE stores
    // and lea in between do not modify EFLAGS, so do not insert
    // flag-writing instructions between them.
1542050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    sub        ecx, 16
1543050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1544050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1545050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    lea        edx, [edx + 8]
1546050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    jg         convertloop
1547050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
1548050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pop        edi
1549050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    pop        esi
1550050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    ret
1551050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com  }
1552050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com}
1553050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com
// Converts one row of ARGB to full-resolution U and V (SSSE3): one U byte and
// one V byte per source pixel, with no subsampling.
// Each loop iteration reads the same 16 pixels (64 bytes) twice, once with
// the kARGBToU coefficients and once with kARGBToV, and stores 16 bytes to
// dst_u and 16 bytes to dst_v. The constants are defined elsewhere in this
// file.
// All source loads and destination stores use movdqa, which requires
// 16-byte-aligned addresses.
// The loop body runs before width is tested and always handles a full 16
// pixels; presumably callers pass a positive multiple of 16 - TODO confirm.
1554050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com__declspec(naked) __declspec(align(16))
155541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.comvoid ARGBToUV444Row_SSSE3(const uint8* src_argb0,
155641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com                          uint8* dst_u, uint8* dst_v, int width) {
1557f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
    // Naked function: arguments are addressed relative to esp. The first "4"
    // skips the register pushed here; the next "4" skips the return address.
155841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    push       edi
155941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
156041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        edx, [esp + 4 + 8]   // dst_u
156141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        edi, [esp + 4 + 12]  // dst_v
156241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width (pixels remaining)
156341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm7, kARGBToU
156441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm6, kARGBToV
156541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm5, kAddUV128
156641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
156741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
1568c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
156941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com convertloop:
157041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    /* convert to U and V */
157141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm0, [eax]          // U
157241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm1, [eax + 16]
157341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm2, [eax + 32]
157441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm3, [eax + 48]
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent products (two words per pixel); phaddw adds those
    // two words, giving one 16-bit value per pixel.
157541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm0, xmm7
157641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm1, xmm7
157741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm2, xmm7
157841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm3, xmm7
157941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm0, xmm1
158041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm2, xmm3
1581d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm0, 8
1582d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm2, 8
1583d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    packsswb   xmm0, xmm2
    // Add the kAddUV128 bias to the 16 packed bytes.
158441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    paddb      xmm0, xmm5
    // The flags set by this sub are the ones jg tests at the end of the loop;
    // the SSE instructions and lea in between do not modify EFLAGS, so do not
    // insert flag-writing instructions between them.
158541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    sub        ecx,  16
158641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     [edx], xmm0
158741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
    // Second pass over the same 64 source bytes, this time with the V
    // coefficients (eax has not been advanced yet).
158841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm0, [eax]          // V
158941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm1, [eax + 16]
159041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm2, [eax + 32]
159141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm3, [eax + 48]
159241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm0, xmm6
159341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm1, xmm6
159441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm2, xmm6
159541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm3, xmm6
159641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm0, xmm1
159741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm2, xmm3
1598d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm0, 8
1599d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm2, 8
1600d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    packsswb   xmm0, xmm2
160141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    paddb      xmm0, xmm5
160241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    lea        eax,  [eax + 64]
160341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     [edx + edi], xmm0
160441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    lea        edx,  [edx + 16]
160541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    jg         convertloop
160641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
160741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pop        edi
160841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    ret
160941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com  }
161041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com}
161141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
// Converts one row of ARGB to full-resolution U and V (SSSE3): one U byte and
// one V byte per source pixel, with no subsampling.
// Source loads and destination stores use movdqu, so src_argb0, dst_u and
// dst_v need no particular alignment.
// Each loop iteration reads the same 16 pixels (64 bytes) twice, once with
// the kARGBToU coefficients and once with kARGBToV, and stores 16 bytes to
// dst_u and 16 bytes to dst_v. The constants are defined elsewhere in this
// file.
// The loop body runs before width is tested and always handles a full 16
// pixels; presumably callers pass a positive multiple of 16 - TODO confirm.
161241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com__declspec(naked) __declspec(align(16))
161341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.comvoid ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
161441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com                                    uint8* dst_u, uint8* dst_v, int width) {
1615f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
    // Naked function: arguments are addressed relative to esp. The first "4"
    // skips the register pushed here; the next "4" skips the return address.
161641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    push       edi
161741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
161841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        edx, [esp + 4 + 8]   // dst_u
161941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        edi, [esp + 4 + 12]  // dst_v
162041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width (pixels remaining)
162141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm7, kARGBToU
162241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm6, kARGBToV
162341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqa     xmm5, kAddUV128
162441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
162541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
1626c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
162741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com convertloop:
162841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    /* convert to U and V */
162941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm0, [eax]          // U
163041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm1, [eax + 16]
163141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm2, [eax + 32]
163241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm3, [eax + 48]
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent products (two words per pixel); phaddw adds those
    // two words, giving one 16-bit value per pixel.
163341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm0, xmm7
163441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm1, xmm7
163541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm2, xmm7
163641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm3, xmm7
163741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm0, xmm1
163841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm2, xmm3
1639d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm0, 8
1640d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm2, 8
1641d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    packsswb   xmm0, xmm2
    // Add the kAddUV128 bias to the 16 packed bytes.
164241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    paddb      xmm0, xmm5
    // The flags set by this sub are the ones jg tests at the end of the loop;
    // the SSE instructions and lea in between do not modify EFLAGS, so do not
    // insert flag-writing instructions between them.
164341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    sub        ecx,  16
164441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     [edx], xmm0
164541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
    // Second pass over the same 64 source bytes, this time with the V
    // coefficients (eax has not been advanced yet).
164641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm0, [eax]          // V
164741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm1, [eax + 16]
164841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm2, [eax + 32]
164941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     xmm3, [eax + 48]
165041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm0, xmm6
165141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm1, xmm6
165241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm2, xmm6
165341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pmaddubsw  xmm3, xmm6
165441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm0, xmm1
165541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    phaddw     xmm2, xmm3
1656d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm0, 8
1657d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    psraw      xmm2, 8
1658d8b73cacbefdf3cf397a410edba1d58296844adbfbarchard@google.com    packsswb   xmm0, xmm2
165941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    paddb      xmm0, xmm5
166041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    lea        eax,  [eax + 64]
166141e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    movdqu     [edx + edi], xmm0
166241e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    lea        edx,  [edx + 16]
166341e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    jg         convertloop
166441e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
166541e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    pop        edi
166641e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com    ret
166741e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com  }
166841e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com}
166941e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com
// Converts one row of ARGB to horizontally subsampled U and V (SSSE3).
// Only a single source row is read: each loop iteration loads 16 pixels
// (64 bytes), averages each horizontal pixel pair with pavgb, and stores
// 8 U bytes to dst_u and 8 V bytes to dst_v. kARGBToU, kARGBToV and kAddUV128
// are defined elsewhere in this file.
// Source loads use movdqa, which requires a 16-byte-aligned address; the
// 8-byte stores use movlps/movhps.
// The loop body runs before width is tested and always handles a full 16
// pixels; presumably callers pass a positive multiple of 16 - TODO confirm.
167041e972ec31e83b531f4cef30b5be63ffa6aa3cfdfbarchard@google.com__declspec(naked) __declspec(align(16))
1671bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.comvoid ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1672bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                          uint8* dst_u, uint8* dst_v, int width) {
1673f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
    // Naked function: arguments are addressed relative to esp. The first "4"
    // skips the register pushed here; the next "4" skips the return address.
1674bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    push       edi
1675bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
1676bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        edx, [esp + 4 + 8]   // dst_u
1677bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        edi, [esp + 4 + 12]  // dst_v
1678bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width (pixels remaining)
1679bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm7, kARGBToU
1680bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm6, kARGBToV
1681bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm5, kAddUV128
1682bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
1683bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1684c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1685bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com convertloop:
1686bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    /* step 1 - subsample 16x1 argb pixels to 8x1 (single row, no vertical averaging) */
1687bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm0, [eax]
1688bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm1, [eax + 16]
1689bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm2, [eax + 32]
1690bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm3, [eax + 48]
1691bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels), 0xdd
    // gathers dwords 1 and 3 (odd pixels); pavgb then averages each even/odd
    // pair, leaving 4 pixels per register.
1692bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm4, xmm0
1693bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm0, xmm1, 0x88
1694bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm4, xmm1, 0xdd
1695bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pavgb      xmm0, xmm4
1696bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm4, xmm2
1697bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm2, xmm3, 0x88
1698bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
1699bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pavgb      xmm2, xmm4
1700bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1701bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // step 2 - convert to U and V
1702bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // from here down is very similar to Y code except
1703bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent products (two words per pixel); phaddw adds those
    // two words, giving one 16-bit value per pixel: 8 U in xmm0, 8 V in xmm1.
1704bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm1, xmm0
1705bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm3, xmm2
1706bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
1707bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm2, xmm7
1708bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
1709bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm3, xmm6
1710bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    phaddw     xmm0, xmm2
1711bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    phaddw     xmm1, xmm3
1712bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    psraw      xmm0, 8
1713bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    psraw      xmm1, 8
    // Pack to signed bytes: U lands in the low 8 bytes, V in the high 8 bytes.
1714bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    packsswb   xmm0, xmm1
1715bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 bias -> unsigned
1716bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1717bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // step 3 - store 8 U and 8 V values
    // The flags set by this sub are the ones jg tests below; the SSE stores
    // and lea in between do not modify EFLAGS, so do not insert
    // flag-writing instructions between them.
1718bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    sub        ecx, 16
1719bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U (low 8 bytes)
1720bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V (high 8 bytes)
1721bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    lea        edx, [edx + 8]
1722bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    jg         convertloop
1723bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1724bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pop        edi
1725bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    ret
1726bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com  }
1727bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com}
1728bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
// Computes U and V bytes from a single row of 4-byte pixels. Only src_argb0
// is read (there is no stride argument), so there is no vertical averaging.
// Each loop pass loads 64 source bytes (16 pixels), averages horizontally
// adjacent pixel pairs, and stores 8 bytes to dst_u and 8 bytes to dst_v.
//   src_argb0: source row; read with movdqu, so no alignment is required.
//   dst_u, dst_v: destination rows; 8 bytes are written to each per pass.
//   width: pixel count. The body runs once before the count is tested and
//          always handles a full group of 16, so this presumably must be a
//          positive multiple of 16 - TODO confirm against callers.
// kARGBToU, kARGBToV and kAddUV128 are defined outside this function; they
// are loaded with movdqa, which requires them to be 16-byte aligned.
1729bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com__declspec(naked) __declspec(align(16))
1730bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.comvoid ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1731bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                    uint8* dst_u, uint8* dst_v, int width) {
1732f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
1733bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    push       edi  // saved here, restored before ret
1734bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0 (leading 4 = bytes pushed above)
1735bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        edx, [esp + 4 + 8]   // dst_u
1736bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        edi, [esp + 4 + 12]  // dst_v
1737bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width (pixel count)
1738bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm7, kARGBToU  // multiplier for the U pmaddubsw below
1739bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm6, kARGBToV  // multiplier for the V pmaddubsw below
1740bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm5, kAddUV128  // byte offset added after packing
1741bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] is V
1742bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1743c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1744bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com convertloop:
1745bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    /* step 1 - subsample 16x1 pixels to 8x1 (one row only) */
1746bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqu     xmm0, [eax]
1747bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqu     xmm1, [eax + 16]
1748bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqu     xmm2, [eax + 32]
1749bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqu     xmm3, [eax + 48]
1750bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    lea        eax,  [eax + 64]
1751bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm4, xmm0
1752bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm0, xmm1, 0x88  // even-indexed 32-bit pixels
1753bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm4, xmm1, 0xdd  // odd-indexed 32-bit pixels
1754bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pavgb      xmm0, xmm4  // average each horizontal pair
1755bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm4, xmm2
1756bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm2, xmm3, 0x88
1757bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
1758bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pavgb      xmm2, xmm4
1759bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1760bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // step 2 - convert to U and V
1761bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // from here down is very similar to Y code except
1762bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // instead of 16 different pixels, it is 8 pixels of U and 8 of V
1763bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm1, xmm0
1764bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movdqa     xmm3, xmm2
1765bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
1766bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm2, xmm7
1767bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
1768bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pmaddubsw  xmm3, xmm6
1769bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    phaddw     xmm0, xmm2  // sum adjacent words: one U word per output
1770bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    phaddw     xmm1, xmm3  // same for V
1771bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    psraw      xmm0, 8
1772bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    psraw      xmm1, 8
1773bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V
1774bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 -> unsigned
1775bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1776bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    // step 3 - store 8 U and 8 V values
1777bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    sub        ecx, 16  // sets the flags that jg tests below
1778bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movlps     qword ptr [edx], xmm0 // 8 U bytes to dst_u
1779bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // 8 V bytes to dst_v
1780bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    lea        edx, [edx + 8]  // lea and the stores leave the flags intact
1781bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    jg         convertloop  // repeat while the remaining count is > 0
1782bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
1783bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    pop        edi
1784bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com    ret
1785bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com  }
1786bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com}
1787bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com
// Computes U and V bytes from two rows of 4-byte pixels. Each loop pass reads
// 64 bytes (16 pixels) at src_argb0 and 64 bytes at
// src_argb0 + src_stride_argb, averages the two rows, then averages
// horizontally adjacent pixel pairs, and stores 8 bytes to dst_u and 8 bytes
// to dst_v.
//   src_argb0: first source row. It is read with movdqa and the second row is
//              a memory operand of pavgb, so both row addresses must be
//              16-byte aligned.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v: destination rows; 8 bytes are written to each per pass.
//   width: pixel count. The body runs once before the count is tested and
//          always handles a full group of 16, so this presumably must be a
//          positive multiple of 16 - TODO confirm against callers.
// kBGRAToU, kBGRAToV and kAddUV128 are defined outside this function; they
// are loaded with movdqa, which requires them to be 16-byte aligned.
1788bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com__declspec(naked) __declspec(align(16))
17899394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.comvoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
17909394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com                       uint8* dst_u, uint8* dst_v, int width) {
1791f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
17929394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    push       esi  // esi and edi are saved here, restored before ret
17939394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    push       edi
17949394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0 (leading 8 = bytes pushed above)
17959394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
17969394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
17979394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
17989394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width (pixel count)
17996334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm7, kBGRAToU  // multiplier for the U pmaddubsw below
18006334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm6, kBGRAToV  // multiplier for the V pmaddubsw below
18016334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm5, kAddUV128  // byte offset added after packing
18029394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] is V
1803585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
1804c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1805eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
18069394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    /* step 1 - subsample 16x2 pixels to 8x1 */
18079394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm0, [eax]
18089394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, [eax + 16]
18099394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm2, [eax + 32]
18109394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, [eax + 48]
18119394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm0, [eax + esi]  // average with the row one stride away
18129394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm1, [eax + esi + 16]
18139394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm2, [eax + esi + 32]
18149394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm3, [eax + esi + 48]
18159394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        eax,  [eax + 64]
18169394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm4, xmm0
18179394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm0, xmm1, 0x88  // even-indexed 32-bit pixels
18189394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm4, xmm1, 0xdd  // odd-indexed 32-bit pixels
18199394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm0, xmm4  // average each horizontal pair
18209394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm4, xmm2
18219394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm2, xmm3, 0x88
18229394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
18239394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm2, xmm4
18249394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
18259394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // step 2 - convert to U and V
18269394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // from here down is very similar to Y code except
18279394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // instead of 16 different pixels, it is 8 pixels of U and 8 of V
18289394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, xmm0
18299394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, xmm2
18309394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
18319394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm2, xmm7
18329394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
18339394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm3, xmm6
18349394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm0, xmm2  // sum adjacent words: one U word per output
18359394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm1, xmm3  // same for V
18369394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psraw      xmm0, 8
18379394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psraw      xmm1, 8
18389394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V
18399394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 -> unsigned
18409394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
18419394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // step 3 - store 8 U and 8 V values
184218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16  // sets the flags that jg tests below
18439394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movlps     qword ptr [edx], xmm0 // 8 U bytes to dst_u
18449394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // 8 V bytes to dst_v
18459394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        edx, [edx + 8]  // lea and the stores leave the flags intact
184618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while the remaining count is > 0
184718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
18489394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pop        edi
18499394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pop        esi
18509394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    ret
18519394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com  }
1852585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com}
1853585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
// Computes U and V bytes from two rows of 4-byte pixels, with no alignment
// requirement on the source. Each loop pass reads 64 bytes (16 pixels) at
// src_argb0 and 64 bytes at src_argb0 + src_stride_argb, averages the two
// rows, then averages horizontally adjacent pixel pairs, and stores 8 bytes
// to dst_u and 8 bytes to dst_v.
//   src_argb0: first source row; both rows are read with movdqu.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v: destination rows; 8 bytes are written to each per pass.
//   width: pixel count. The body runs once before the count is tested and
//          always handles a full group of 16, so this presumably must be a
//          positive multiple of 16 - TODO confirm against callers.
// kBGRAToU, kBGRAToV and kAddUV128 are defined outside this function; they
// are loaded with movdqa, which requires them to be 16-byte aligned.
1854d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
1855b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.comvoid BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1856b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com                                 uint8* dst_u, uint8* dst_v, int width) {
1857f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
1858b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    push       esi  // esi and edi are saved here, restored before ret
1859b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    push       edi
1860b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0 (leading 8 = bytes pushed above)
1861b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1862b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1863b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
1864b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width (pixel count)
1865b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm7, kBGRAToU  // multiplier for the U pmaddubsw below
1866b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm6, kBGRAToV  // multiplier for the V pmaddubsw below
1867b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm5, kAddUV128  // byte offset added after packing
1868b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] is V
1869b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1870c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1871b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com convertloop:
1872b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    /* step 1 - subsample 16x2 pixels to 8x1 */
1873b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm0, [eax]
1874b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm1, [eax + 16]
1875b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm2, [eax + 32]
1876b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm3, [eax + 48]
1877b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi]  // second row goes through xmm4 via movdqu
1878b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm0, xmm4  // average the two rows
1879b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 16]
1880b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm1, xmm4
1881b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 32]
1882b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm2, xmm4
1883b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 48]
1884b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm3, xmm4
1885b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        eax,  [eax + 64]
1886b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, xmm0
1887b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm0, xmm1, 0x88  // even-indexed 32-bit pixels
1888b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm4, xmm1, 0xdd  // odd-indexed 32-bit pixels
1889b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm0, xmm4  // average each horizontal pair
1890b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, xmm2
1891b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm2, xmm3, 0x88
1892b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
1893b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm2, xmm4
1894b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1895b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // step 2 - convert to U and V
1896b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // from here down is very similar to Y code except
1897b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // instead of 16 different pixels, it is 8 pixels of U and 8 of V
1898b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm1, xmm0
1899b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm3, xmm2
1900b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
1901b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm2, xmm7
1902b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
1903b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm3, xmm6
1904b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm0, xmm2  // sum adjacent words: one U word per output
1905b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm1, xmm3  // same for V
1906b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psraw      xmm0, 8
1907b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psraw      xmm1, 8
1908b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V
1909b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 -> unsigned
1910b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1911b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // step 3 - store 8 U and 8 V values
191218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16  // sets the flags that jg tests below
1913b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movlps     qword ptr [edx], xmm0 // 8 U bytes to dst_u
1914b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // 8 V bytes to dst_v
1915b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        edx, [edx + 8]  // lea and the stores leave the flags intact
191618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while the remaining count is > 0
191718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
1918b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pop        edi
1919b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pop        esi
1920b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    ret
1921b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com  }
1922b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com}
1923b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
// Computes U and V bytes from two rows of 4-byte pixels. Each loop pass reads
// 64 bytes (16 pixels) at src_argb0 and 64 bytes at
// src_argb0 + src_stride_argb, averages the two rows, then averages
// horizontally adjacent pixel pairs, and stores 8 bytes to dst_u and 8 bytes
// to dst_v.
//   src_argb0: first source row. It is read with movdqa and the second row is
//              a memory operand of pavgb, so both row addresses must be
//              16-byte aligned.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v: destination rows; 8 bytes are written to each per pass.
//   width: pixel count. The body runs once before the count is tested and
//          always handles a full group of 16, so this presumably must be a
//          positive multiple of 16 - TODO confirm against callers.
// kABGRToU, kABGRToV and kAddUV128 are defined outside this function; they
// are loaded with movdqa, which requires them to be 16-byte aligned.
1924d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
19259394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.comvoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
19269394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com                       uint8* dst_u, uint8* dst_v, int width) {
1927f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
19289394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    push       esi  // esi and edi are saved here, restored before ret
19299394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    push       edi
19309394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0 (leading 8 = bytes pushed above)
19319394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
19329394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
19339394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
19349394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width (pixel count)
19356334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm7, kABGRToU  // multiplier for the U pmaddubsw below
19366334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm6, kABGRToV  // multiplier for the V pmaddubsw below
19376334808d9d40071249ba9b51b65aa4e3b6e7f43ffbarchard@google.com    movdqa     xmm5, kAddUV128  // byte offset added after packing
19389394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] is V
19399394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
1940c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1941eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
19429394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    /* step 1 - subsample 16x2 pixels to 8x1 */
19439394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm0, [eax]
19449394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, [eax + 16]
19459394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm2, [eax + 32]
19469394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, [eax + 48]
19479394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm0, [eax + esi]  // average with the row one stride away
19489394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm1, [eax + esi + 16]
19499394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm2, [eax + esi + 32]
19509394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm3, [eax + esi + 48]
19519394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        eax,  [eax + 64]
19529394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm4, xmm0
19539394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm0, xmm1, 0x88  // even-indexed 32-bit pixels
19549394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm4, xmm1, 0xdd  // odd-indexed 32-bit pixels
19559394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm0, xmm4  // average each horizontal pair
19569394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm4, xmm2
19579394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm2, xmm3, 0x88
19589394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
19599394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pavgb      xmm2, xmm4
19609394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
19619394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // step 2 - convert to U and V
19629394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // from here down is very similar to Y code except
19639394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // instead of 16 different pixels, it is 8 pixels of U and 8 of V
19649394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm1, xmm0
19659394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movdqa     xmm3, xmm2
19669394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
19679394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm2, xmm7
19689394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
19699394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pmaddubsw  xmm3, xmm6
1970b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm0, xmm2  // sum adjacent words: one U word per output
1971b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    phaddw     xmm1, xmm3  // same for V
1972b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psraw      xmm0, 8
1973b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    psraw      xmm1, 8
1974b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V
1975b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 -> unsigned
1976b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
1977b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // step 3 - store 8 U and 8 V values
197818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16  // sets the flags that jg tests below
1979b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movlps     qword ptr [edx], xmm0 // 8 U bytes to dst_u
1980b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // 8 V bytes to dst_v
1981b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        edx, [edx + 8]  // lea and the stores leave the flags intact
198218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while the remaining count is > 0
198318184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
1984b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pop        edi
1985b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pop        esi
1986b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    ret
1987b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com  }
1988b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com}
1989b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
// Computes U and V bytes from two rows of 4-byte pixels, with no alignment
// requirement on the source. Each loop pass reads 64 bytes (16 pixels) at
// src_argb0 and 64 bytes at src_argb0 + src_stride_argb, averages the two
// rows, then averages horizontally adjacent pixel pairs, and stores 8 bytes
// to dst_u and 8 bytes to dst_v.
//   src_argb0: first source row; both rows are read with movdqu.
//   src_stride_argb: byte offset from the first row to the second row.
//   dst_u, dst_v: destination rows; 8 bytes are written to each per pass.
//   width: pixel count. The body runs once before the count is tested and
//          always handles a full group of 16, so this presumably must be a
//          positive multiple of 16 - TODO confirm against callers.
// kABGRToU, kABGRToV and kAddUV128 are defined outside this function; they
// are loaded with movdqa, which requires them to be 16-byte aligned.
1990d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
1991b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.comvoid ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1992b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com                                 uint8* dst_u, uint8* dst_v, int width) {
1993f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
1994b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    push       esi  // esi and edi are saved here, restored before ret
1995b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    push       edi
1996b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0 (leading 8 = bytes pushed above)
1997b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
1998b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
1999b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
2000b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width (pixel count)
2001b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm7, kABGRToU  // multiplier for the U pmaddubsw below
2002b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm6, kABGRToV  // multiplier for the V pmaddubsw below
2003b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm5, kAddUV128  // byte offset added after packing
2004b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] is V
2005b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
2006c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2007b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com convertloop:
2008b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    /* step 1 - subsample 16x2 pixels to 8x1 */
2009b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm0, [eax]
2010b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm1, [eax + 16]
2011b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm2, [eax + 32]
2012b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm3, [eax + 48]
2013b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi]  // second row goes through xmm4 via movdqu
2014b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm0, xmm4  // average the two rows
2015b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 16]
2016b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm1, xmm4
2017b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 32]
2018b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm2, xmm4
2019b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqu     xmm4, [eax + esi + 48]
2020b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm3, xmm4
2021b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    lea        eax,  [eax + 64]
2022b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, xmm0
2023b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm0, xmm1, 0x88  // even-indexed 32-bit pixels
2024b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm4, xmm1, 0xdd  // odd-indexed 32-bit pixels
2025b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm0, xmm4  // average each horizontal pair
2026b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm4, xmm2
2027b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm2, xmm3, 0x88
2028b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
2029b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pavgb      xmm2, xmm4
2030b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com
2031b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // step 2 - convert to U and V
2032b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // from here down is very similar to Y code except
2033b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    // instead of 16 different pixels, it is 8 pixels of U and 8 of V
2034b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm1, xmm0
2035b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    movdqa     xmm3, xmm2
2036b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
2037b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm2, xmm7
2038b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
2039b5b27d131adf623aa98109fe4196cd492c2d8b60fbarchard@google.com    pmaddubsw  xmm3, xmm6
20409394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm0, xmm2  // sum adjacent words: one U word per output
20419394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    phaddw     xmm1, xmm3  // same for V
20429394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psraw      xmm0, 8
20439394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    psraw      xmm1, 8
20449394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V
20459394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    paddb      xmm0, xmm5            // add kAddUV128 -> unsigned
20469394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com
20479394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    // step 3 - store 8 U and 8 V values
204818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16  // sets the flags that jg tests below
20499394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movlps     qword ptr [edx], xmm0 // 8 U bytes to dst_u
20509394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // 8 V bytes to dst_v
20519394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    lea        edx, [edx + 8]  // lea and the stores leave the flags intact
205218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while the remaining count is > 0
205318184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
20549394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pop        edi
20559394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    pop        esi
20569394ed99fcc9802a068ba4a44c36aed79ce87157fbarchard@google.com    ret
2057585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com  }
2058585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com}
205925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
// Produces one row of 2x2-subsampled U and V bytes from two rows of 4-byte
// pixels: the row at src_argb0 and the row src_stride_argb bytes after it.
// Each loop iteration reads 16 pixels (64 bytes) from each row, averages them
// vertically and then horizontally with pavgb, and stores 8 U bytes to dst_u
// and 8 V bytes to dst_v.
// The movdqa loads and the pavgb memory operands require 16-byte aligned
// source addresses in both rows.
// width is counted down by 16 per iteration and the loop repeats while the
// remainder is positive, so a full 16-pixel group is always processed.
// Naked function: arguments are read straight from the stack, and esi/edi are
// saved and restored by hand.
206025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com__declspec(naked) __declspec(align(16))
206125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
206225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                       uint8* dst_u, uint8* dst_v, int width) {
2063f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
206425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       esi
206525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       edi
    // The "+ 8" below skips the two registers pushed above.
206625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0
206725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
206825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
206925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
207025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
207125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm7, kRGBAToU
207225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm6, kRGBAToV
207325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm5, kAddUV128
    // Keep only edx as the running output pointer; V is addressed as edx + edi.
207425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        edi, edx             // stride from u to v
207525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
2076c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
207725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com convertloop:
207825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    /* step 1 - subsample 16x2 pixels to 8x1 */
    // Vertical average: current row with the row src_stride_argb bytes below.
207925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm0, [eax]
208025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm1, [eax + 16]
208125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm2, [eax + 32]
208225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm3, [eax + 48]
208325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm0, [eax + esi]
208425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm1, [eax + esi + 16]
208525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm2, [eax + esi + 32]
208625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm3, [eax + esi + 48]
208725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        eax,  [eax + 64]
    // Horizontal average: shufps 0x88 gathers the even-indexed pixels (dwords)
    // of the register pair, 0xdd gathers the odd-indexed ones; pavgb then
    // averages each even pixel with its odd neighbour.
208825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm4, xmm0
208925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm0, xmm1, 0x88
209025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm4, xmm1, 0xdd
209125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm0, xmm4
209225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm4, xmm2
209325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm2, xmm3, 0x88
209425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
209525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm2, xmm4
209625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
209725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // step 2 - convert to U and V
209825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // from here down is very similar to Y code except
209925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent pairs; phaddw finishes the per-pixel sum.
210025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm1, xmm0
210125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm3, xmm2
210225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
210325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm2, xmm7
210425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
210525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm3, xmm6
210625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm0, xmm2
210725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm1, xmm3
210825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psraw      xmm0, 8
210925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psraw      xmm1, 8
    // Pack to signed bytes: low 8 bytes = U, high 8 bytes = V.
211025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    packsswb   xmm0, xmm1
211125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    paddb      xmm0, xmm5            // -> unsigned
211225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
211325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // step 3 - store 8 U and 8 V values
    // The flags set by sub are consumed by jg below; the stores and lea in
    // between do not modify flags.
211425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        ecx, 16
211525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U
211625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V
211725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        edx, [edx + 8]
211825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    jg         convertloop
211925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
212025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        edi
212125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        esi
212225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    ret
212325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  }
212425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com}
212525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
// Produces one row of 2x2-subsampled U and V bytes from two rows of 4-byte
// pixels: the row at src_argb0 and the row src_stride_argb bytes after it.
// All source reads use movdqu, so neither row needs to be 16-byte aligned;
// the second row is staged through xmm4 because pavgb with a memory operand
// would require alignment.
// Each loop iteration reads 16 pixels (64 bytes) from each row and stores
// 8 U bytes to dst_u and 8 V bytes to dst_v. width is counted down by 16 per
// iteration and the loop repeats while the remainder is positive.
// Naked function: arguments are read straight from the stack, and esi/edi are
// saved and restored by hand.
212625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com__declspec(naked) __declspec(align(16))
212725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
212825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                                 uint8* dst_u, uint8* dst_v, int width) {
2129f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
213025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       esi
213125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       edi
    // The "+ 8" below skips the two registers pushed above.
213225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb0
213325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_stride_argb
213425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8 + 12]  // dst_u
213525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edi, [esp + 8 + 16]  // dst_v
213625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
213725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm7, kRGBAToU
213825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm6, kRGBAToV
213925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm5, kAddUV128
    // Keep only edx as the running output pointer; V is addressed as edx + edi.
214025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        edi, edx             // stride from u to v
214125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
2142c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
214325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com convertloop:
214425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    /* step 1 - subsample 16x2 pixels to 8x1 */
    // Vertical average: current row with the row src_stride_argb bytes below,
    // loading the lower row through xmm4 with unaligned loads.
214525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm0, [eax]
214625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm1, [eax + 16]
214725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm2, [eax + 32]
214825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm3, [eax + 48]
214925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm4, [eax + esi]
215025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm0, xmm4
215125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm4, [eax + esi + 16]
215225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm1, xmm4
215325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm4, [eax + esi + 32]
215425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm2, xmm4
215525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     xmm4, [eax + esi + 48]
215625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm3, xmm4
215725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        eax,  [eax + 64]
    // Horizontal average: shufps 0x88 gathers the even-indexed pixels (dwords)
    // of the register pair, 0xdd gathers the odd-indexed ones; pavgb then
    // averages each even pixel with its odd neighbour.
215825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm4, xmm0
215925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm0, xmm1, 0x88
216025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm4, xmm1, 0xdd
216125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm0, xmm4
216225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm4, xmm2
216325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm2, xmm3, 0x88
216425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    shufps     xmm4, xmm3, 0xdd
216525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pavgb      xmm2, xmm4
216625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
216725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // step 2 - convert to U and V
216825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // from here down is very similar to Y code except
216925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient
    // bytes and sums adjacent pairs; phaddw finishes the per-pixel sum.
217025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm1, xmm0
217125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm3, xmm2
217225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm0, xmm7  // U
217325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm2, xmm7
217425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm1, xmm6  // V
217525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pmaddubsw  xmm3, xmm6
217625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm0, xmm2
217725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    phaddw     xmm1, xmm3
217825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psraw      xmm0, 8
217925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    psraw      xmm1, 8
    // Pack to signed bytes: low 8 bytes = U, high 8 bytes = V.
218025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    packsswb   xmm0, xmm1
218125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    paddb      xmm0, xmm5            // -> unsigned
218225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
218325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // step 3 - store 8 U and 8 V values
    // The flags set by sub are consumed by jg below; the stores and lea in
    // between do not modify flags.
218425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        ecx, 16
218525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movlps     qword ptr [edx], xmm0 // U
218625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movhps     qword ptr [edx + edi], xmm0 // V
218725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        edx, [edx + 8]
218825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    jg         convertloop
218925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
219025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        edi
219125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        esi
219225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    ret
219325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  }
219425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com}
21954c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com#endif  // HAS_ARGBTOYROW_SSSE3
2196585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
2197c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com#ifdef HAS_I422TOARGBROW_AVX2
2198c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
// Constant tables used as 256-bit memory operands by I422ToARGBRow_AVX2.
// The three kUVTo*_AVX tables hold 32 bytes of alternating U and V
// coefficients, matching the interleaved UV byte layout fed to vpmaddubsw.
2199851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec8 kUVToB_AVX = {
2200c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
2201c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
2202c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com};
2203851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec8 kUVToR_AVX = {
2204c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
2205c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
2206c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com};
2207851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec8 kUVToG_AVX = {
2208c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
2209c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
2210c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com};
// 16-entry tables consumed by word-sized ops (vpmullw / vpsubsw / vpsubw):
// the Y multiplier, the value subtracted from Y before scaling, and the
// per-channel biases subtracted from the UV products.
2211851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec16 kYToRgb_AVX = {
22122b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
22132b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com};
2214851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec16 kYSub16_AVX = {
22152b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
22162b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com};
2217851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec16 kUVBiasB_AVX = {
22182b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
22192b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com};
2220851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec16 kUVBiasG_AVX = {
22212b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
22222b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com};
2223851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const lvec16 kUVBiasR_AVX = {
22242b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
22252b115a5237a8133a2eff060880f29198adf35eecfbarchard@google.com};
2226c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
2227c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com// 16 pixels
2228c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Per loop iteration: reads 16 bytes from y_buf, 8 bytes each from u_buf and
// v_buf, and writes 64 bytes to dst_argb with unaligned stores. The alpha byte
// of every output pixel is 0xff.
// width is counted down by 16 per iteration and the loop repeats while the
// remainder is positive, so a full 16-pixel group is always processed.
// AVX2 unpack/pack instructions work within each 128-bit lane, which is why
// vpermq 0xd8 (swap the two middle qwords) appears before the unpack steps.
// Naked function: arguments are read straight from the stack, and esi/edi are
// saved and restored by hand.
2229c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com__declspec(naked) __declspec(align(16))
2230c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.comvoid I422ToARGBRow_AVX2(const uint8* y_buf,
2231e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                        const uint8* u_buf,
2232e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                        const uint8* v_buf,
2233e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                        uint8* dst_argb,
2234e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com                        int width) {
2235c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  __asm {
2236c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    push       esi
2237c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    push       edi
    // The "+ 8" below skips the two registers pushed above.
2238c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
2239c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2240c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2241c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    mov        edx, [esp + 8 + 16]  // argb
2242c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the V-minus-U offset so only esi has to advance in the loop.
2243c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    sub        edi, esi
2244c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
    // ymm4 = 0, used to zero-extend Y bytes to words.
2245c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpxor      ymm4, ymm4, ymm4
2246c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
2247c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2248c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com convertloop:
    // Step 1: Find 8 UV contributions to 16 R,G,B values
2249c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vmovq      xmm0, qword ptr [esi]          //  U
2250c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vmovq      xmm1, qword ptr [esi + edi]    //  V
2251cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi,  [esi + 8]
2252cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpunpcklbw ymm0, ymm0, ymm1               // UV
    // Move UV pairs 4..7 into the upper lane so each lane upsamples 4 pairs.
2253c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
2254c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpunpcklwd ymm0, ymm0, ymm0              // UVUV
2255cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
2256cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
2257cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
2258cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
2259cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsubw     ymm1, ymm1, kUVBiasG_AVX
2260cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsubw     ymm0, ymm0, kUVBiasR_AVX
2261cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
2262cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Step 2: Find Y contribution to 16 R,G,B values
2263cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vmovdqu    xmm3, [eax]                  // NOLINT
2264cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        eax, [eax + 16]
    // Put Y 8..15 in the upper lane, then widen each lane's 8 bytes to words.
2265c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpermq     ymm3, ymm3, 0xd8
2266c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpunpcklbw ymm3, ymm3, ymm4
2267cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsubsw    ymm3, ymm3, kYSub16_AVX
2268cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpmullw    ymm3, ymm3, kYToRgb_AVX
2269cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpaddsw    ymm2, ymm2, ymm3           // B += Y
2270cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpaddsw    ymm1, ymm1, ymm3           // G += Y
2271cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpaddsw    ymm0, ymm0, ymm3           // R += Y
2272cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsraw     ymm2, ymm2, 6
2273cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsraw     ymm1, ymm1, 6
2274cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpsraw     ymm0, ymm0, 6
    // Saturate to unsigned bytes; each lane's low 8 bytes hold its 8 pixels.
2275c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpackuswb  ymm2, ymm2, ymm2           // B
2276c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpackuswb  ymm1, ymm1, ymm1           // G
2277cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpackuswb  ymm0, ymm0, ymm0           // R
2278c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
2279c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    // Step 3: Weave into ARGB
    // The vpermq after each byte unpack reorders qwords so that the low/high
    // word unpacks below emit pixels 0..7 and 8..15 in order.
2280c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpunpcklbw ymm2, ymm2, ymm1           // BG
2281c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpermq     ymm2, ymm2, 0xd8
2282c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpunpcklbw ymm0, ymm0, ymm5           // RA
2283c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
2284cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
2285cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
2286c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vmovdqu    [edx], ymm1
2287c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vmovdqu    [edx + 32], ymm2
2288c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    lea        edx,  [edx + 64]
2289c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    sub        ecx, 16
2290c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    jg         convertloop
    // Clear the upper ymm halves before returning to code that may use
    // legacy SSE instructions.
2291c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    vzeroupper
2292c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
2293c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    pop        edi
2294c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    pop        esi
2295c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com    ret
2296c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com  }
2297c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com}
2298c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com#endif  // HAS_I422TOARGBROW_AVX2
2299c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
2300c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com#ifdef HAS_I422TOARGBROW_SSSE3
2301c297d103f199dc8c9565ea0f35bdb0832a9d10b8fbarchard@google.com
2302c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2303e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
230447e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com// Read 8 UV from 444.
// In: esi = U pointer; V is read from [esi + edi], so edi is expected to hold
// the V-minus-U pointer offset.
// Out: xmm0 = 8 interleaved UV byte pairs; esi advanced by 8. Clobbers xmm1.
2305b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#define READYUV444 __asm {                                                     \
2306bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
2307bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
2308e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm lea        esi,  [esi + 8]                                           \
2309e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2310e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  }
2311e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2312c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Read 4 UV from 422, upsample to 8 UV.
// In: esi = U pointer; V is read from [esi + edi], so edi is expected to hold
// the V-minus-U pointer offset.
// Out: xmm0 = 8 UV byte pairs (each of the 4 source pairs duplicated);
// esi advanced by 4. Clobbers xmm1.
2313b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#define READYUV422 __asm {                                                     \
2314d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    __asm movd       xmm0, [esi]          /* U */                              \
2315d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    __asm movd       xmm1, [esi + edi]    /* V */                              \
2316d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    __asm lea        esi,  [esi + 4]                                           \
2317d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2318d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2319d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  }
2320d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2321c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Read 2 UV from 411, upsample to 8 UV.
// In: esi = U pointer; V is read from [esi + edi], so edi is expected to hold
// the V-minus-U pointer offset.
// Out: xmm0 = 8 UV byte pairs (each of the 2 source pairs repeated 4 times);
// esi advanced by 2. Clobbers xmm1 and ebx, so code using this macro must
// preserve ebx itself if its caller needs it.
2322b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#define READYUV411 __asm {                                                     \
23230d19fc5ed37530b1feae839db7b9d1242a9f407ffbarchard@google.com    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
232447e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    __asm movd       xmm0, ebx                                                 \
23250d19fc5ed37530b1feae839db7b9d1242a9f407ffbarchard@google.com    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
232647e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    __asm movd       xmm1, ebx                                                 \
2327e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm lea        esi,  [esi + 2]                                           \
2328e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
2329e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
2330e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
23314c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com  }
23324c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com
2333c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Read 4 UV from NV12, upsample to 8 UV.
// In: esi = pointer to already-interleaved UV bytes (8 bytes are read).
// Out: xmm0 = 8 UV byte pairs (each of the 4 source pairs duplicated);
// esi advanced by 8.
2334b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#define READNV12 __asm {                                                       \
2335bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
23362d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm lea        esi,  [esi + 8]                                           \
23372d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
23382d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  }
23392d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2340c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Convert 8 pixels: 8 UV and 8 Y.
// In: xmm0 = 8 interleaved UV byte pairs; eax = Y pointer (8 bytes are read);
// xmm4 supplies the high byte when Y is widened to words, so it is
// presumably zeroed by the code using this macro (confirm at the use site).
// Out: xmm0 = B, xmm1 = G, xmm2 = R, each as 8 unsigned-saturated bytes
// repeated in both halves of the register; eax advanced by 8. Clobbers xmm3.
2341b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#define YUVTORGB __asm {                                                       \
23424c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
2343e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm movdqa     xmm1, xmm0                                                \
2344e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm movdqa     xmm2, xmm0                                                \
2345e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
2346e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
2347e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
2348e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
2349e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psubw      xmm1, kUVBiasG                                            \
2350e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psubw      xmm2, kUVBiasR                                            \
2351e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
2352e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
2353e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm lea        eax, [eax + 8]                                            \
2354e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm punpcklbw  xmm3, xmm4                                                \
2355e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psubsw     xmm3, kYSub16                                             \
2356e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm pmullw     xmm3, kYToRgb                                             \
2357e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
2358e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
2359e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
2360e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psraw      xmm0, 6                                                   \
2361e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psraw      xmm1, 6                                                   \
2362e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm psraw      xmm2, 6                                                   \
2363e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm packuswb   xmm0, xmm0           /* B */                              \
2364e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm packuswb   xmm1, xmm1           /* G */                              \
2365e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    __asm packuswb   xmm2, xmm2           /* R */                              \
2366e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  }
2367e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2368c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Convert 8 pixels: 8 VU and 8 Y.
// Inline-asm fragment with a register contract instead of parameters:
//   in : xmm0 = 16 chroma bytes consumed pairwise by pmaddubsw (pair order is
//               fixed by the kVUTo* tables, defined elsewhere - TODO confirm),
//        xmm4 = supplies the high byte when Y is widened to 16 bit; the row
//               functions visible nearby clear it with pxor before looping,
//        eax  = pointer to Y; 8 bytes are read and eax is advanced by 8.
//   out: xmm0 = B, xmm1 = G, xmm2 = R as unsigned bytes. Each register is
//        packed with itself, so the same 8 bytes appear in both halves.
//   clobbers: xmm3.
// Per channel: ((chroma . coeff) - bias + (Y - kYSub16) * kYToRgb) >> 6,
// using saturating adds, then packuswb clamps the result to 0..255.
2369b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#define YVUTORGB __asm {                                                       \
23702d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
23712d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm movdqa     xmm1, xmm0                                                \
23722d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm movdqa     xmm2, xmm0                                                \
23732d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
23742d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
23752d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
23762d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
23772d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psubw      xmm1, kUVBiasG                                            \
23782d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psubw      xmm2, kUVBiasR                                            \
23792d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
23802d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
23812d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm lea        eax, [eax + 8]                                            \
23822d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm punpcklbw  xmm3, xmm4                                                \
23832d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psubsw     xmm3, kYSub16                                             \
23842d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm pmullw     xmm3, kYToRgb                                             \
23852d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
23862d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
23872d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
23882d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psraw      xmm0, 6                                                   \
23892d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psraw      xmm1, 6                                                   \
23902d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm psraw      xmm2, 6                                                   \
23912d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm packuswb   xmm0, xmm0           /* B */                              \
23922d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm packuswb   xmm1, xmm1           /* G */                              \
23932d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    __asm packuswb   xmm2, xmm2           /* R */                              \
23942d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  }
23952d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2396e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// 8 pixels, dest aligned 16.
2397c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of planar Y, U and V into ARGB, 8 pixels per iteration.
//   y_buf, u_buf, v_buf - source planes for this row.
//   dst_argb            - destination; written with movdqa, so it must be
//                         16 byte aligned.
//   width               - pixel count. The loop tests it only after a full
//                         group has been written, so ceil(width / 8) * 8
//                         pixels (minimum 8) are read and stored.
// Naked function: there is no compiler prologue, so arguments are fetched
// relative to esp by hand.
2398d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
2399e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I444ToARGBRow_SSSE3(const uint8* y_buf,
2400e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         const uint8* u_buf,
2401e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         const uint8* v_buf,
2402bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_argb,
2403e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         int width) {
2404d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  __asm {
2405d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       esi                  // saved here, restored before ret
2406d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       edi
2407d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y  (+8 skips the two pushes above)
2408d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2409d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2410e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edx, [esp + 8 + 16]  // argb
2411d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
2412d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        edi, esi             // edi = v_buf - u_buf; presumably READYUV444 reads V via esi + edi (macro not shown)
2413d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2414d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pxor       xmm4, xmm4           // zero, kept for the whole loop
2415d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2416c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2417eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
24184c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV444
24194c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
2420d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2421d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    // Step 3: Weave into ARGB
    // YUVTORGB leaves B in xmm0, G in xmm1, R in xmm2 (8 bytes each).
2422d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2423d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
2424d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    movdqa     xmm1, xmm0
2425d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2426d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
24273fe369661abbd1bbca12bd69dc8be0be9a5f9792fbarchard@google.com    movdqa     [edx], xmm0          // aligned store, pixels 0..3
2428d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    movdqa     [edx + 16], xmm1     // aligned store, pixels 4..7
2429d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    lea        edx,  [edx + 32]
2430d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        ecx, 8
243118184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop          // repeat while pixels remain (ecx > 0)
2432d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2433d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        edi
2434d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        esi
2435d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    ret
2436d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  }
2437d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com}
2438d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2439e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// 8 pixels, dest unaligned (stored with movq + movdqu).
2440c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
// Converts one row of planar Y, U and V into packed 3 byte pixels.
//   y_buf, u_buf, v_buf - source planes for this row.
//   dst_rgb24           - destination; 24 bytes are written per iteration.
//   width               - pixel count. The loop tests it only after a full
//                         group has been written, so ceil(width / 8) * 8
//                         pixels (minimum 8) are read and stored.
// Naked function: arguments are fetched relative to esp by hand.
2441d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
2442827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.comvoid I422ToRGB24Row_SSSE3(const uint8* y_buf,
2443827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com                          const uint8* u_buf,
2444827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com                          const uint8* v_buf,
2445bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                          uint8* dst_rgb24,
2446827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com                          int width) {
2447827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com  __asm {
2448827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    push       esi                  // saved here, restored before ret
2449827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    push       edi
2450827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y  (+8 skips the two pushes above)
2451827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2452827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2453827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        edx, [esp + 8 + 16]  // rgb24
2454827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
2455827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    sub        edi, esi             // edi = v_buf - u_buf; presumably READYUV422 reads V via esi + edi (macro not shown)
2456827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pxor       xmm4, xmm4           // zero, kept for the whole loop
2457827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqa     xmm5, kShuffleMaskARGBToRGB24_0  // shuffle tables are defined elsewhere
2458827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqa     xmm6, kShuffleMaskARGBToRGB24
2459827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2460c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2461827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com convertloop:
2462827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    READYUV422
2463827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    YUVTORGB
2464827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2465827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    // Step 3: Weave into RRGB
    // No alpha here: R fills the 4th byte of each 32 bit pixel as a
    // placeholder, and the pshufb steps below pack 4 pixels into 12 bytes.
2466827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2467827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpcklbw  xmm2, xmm2           // RR
2468827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqa     xmm1, xmm0
2469827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
2470827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
2471827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
2472827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
2473827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
2474827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movq       qword ptr [edx], xmm0  // First 8 bytes
2475827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
2476827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    lea        edx,  [edx + 24]
2477827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    sub        ecx, 8
2478827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    jg         convertloop          // repeat while pixels remain (ecx > 0)
2479827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2480827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pop        edi
2481827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pop        esi
2482827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    ret
2483827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com  }
2484827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com}
2485827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2486827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com// 8 pixels, dest unaligned (stored with movq + movdqu).
2487827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
// Converts one row of planar Y, U and V into packed 3 byte pixels. Same
// sequence as I422ToRGB24Row_SSSE3 but with the kShuffleMaskARGBToRAW*
// tables, which set the output byte order (defined elsewhere).
//   y_buf, u_buf, v_buf - source planes for this row.
//   dst_raw             - destination; 24 bytes are written per iteration.
//   width               - pixel count. The loop tests it only after a full
//                         group has been written, so ceil(width / 8) * 8
//                         pixels (minimum 8) are read and stored.
// Naked function: arguments are fetched relative to esp by hand.
2488827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com__declspec(naked) __declspec(align(16))
2489827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.comvoid I422ToRAWRow_SSSE3(const uint8* y_buf,
2490827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com                        const uint8* u_buf,
2491827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com                        const uint8* v_buf,
2492bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                        uint8* dst_raw,
2493827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com                        int width) {
2494827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com  __asm {
2495827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    push       esi                  // saved here, restored before ret
2496827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    push       edi
2497827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y  (+8 skips the two pushes above)
2498827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2499827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2500827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        edx, [esp + 8 + 16]  // raw
2501827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
2502827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    sub        edi, esi             // edi = v_buf - u_buf; presumably READYUV422 reads V via esi + edi (macro not shown)
2503827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pxor       xmm4, xmm4           // zero, kept for the whole loop
2504827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqa     xmm5, kShuffleMaskARGBToRAW_0  // shuffle tables are defined elsewhere
2505827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqa     xmm6, kShuffleMaskARGBToRAW
2506827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2507c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2508827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com convertloop:
2509827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    READYUV422
2510827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    YUVTORGB
2511827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2512827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    // Step 3: Weave into RRGB
    // No alpha here: R fills the 4th byte of each 32 bit pixel as a
    // placeholder, and the pshufb steps below pack 4 pixels into 12 bytes.
2513827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2514827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpcklbw  xmm2, xmm2           // RR
2515827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqa     xmm1, xmm0
2516827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
2517827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
2518827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
2519827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
2520827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
2521827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movq       qword ptr [edx], xmm0  // First 8 bytes
2522827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RAW pixels.
2523827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    lea        edx,  [edx + 24]
2524827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    sub        ecx, 8
2525827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    jg         convertloop          // repeat while pixels remain (ecx > 0)
2526827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2527827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pop        edi
2528827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    pop        esi
2529827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com    ret
2530827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com  }
2531827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com}
2532827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com
2533af1aa56f0b4a5fcac6c36a1a0c02b6917f2c14f2fbarchard@google.com// 8 pixels, dest unaligned.
2534827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
// Converts one row of planar Y, U and V into 16 bit pixels laid out as
// R in bits 11..15, G in bits 5..10 and B in bits 0..4.
//   y_buf, u_buf, v_buf - source planes for this row.
//   rgb565_buf          - destination; 16 bytes are written per iteration
//                         with movdqu.
//   width               - pixel count. The loop tests it only after a full
//                         group has been written, so ceil(width / 8) * 8
//                         pixels (minimum 8) are read and stored.
// Naked function: arguments are fetched relative to esp by hand.
2535827de16bb1fa9fc5cb7237a8c32378cc3e30ae2dfbarchard@google.com__declspec(naked) __declspec(align(16))
253615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.comvoid I422ToRGB565Row_SSSE3(const uint8* y_buf,
253715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com                           const uint8* u_buf,
253815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com                           const uint8* v_buf,
253915449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com                           uint8* rgb565_buf,
254015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com                           int width) {
254115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com  __asm {
254215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    push       esi                  // saved here, restored before ret
254315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    push       edi
254415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y  (+8 skips the two pushes above)
254515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
254615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
254715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    mov        edx, [esp + 8 + 16]  // rgb565
254815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
254915449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    sub        edi, esi             // edi = v_buf - u_buf; presumably READYUV422 reads V via esi + edi (macro not shown)
255015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pxor       xmm4, xmm4           // zero, kept for the whole loop
255115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
255215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrld      xmm5, 27
255315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
255415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrld      xmm6, 26
255515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pslld      xmm6, 5
255615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
255715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pslld      xmm7, 11
255815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com
2559c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
256015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com convertloop:
256115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    READYUV422
256215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    YUVTORGB
256315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com
256415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    // Step 3: Weave into RRGB
256515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
256615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    punpcklbw  xmm2, xmm2           // RR
256715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    movdqa     xmm1, xmm0
256815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
256915449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
257015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com
257115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    // Step 3b: RRGB -> RGB565
    // Each 32 bit lane holds B in bits 0..7, G in 8..15, R in 16..23 and
    // 24..31. The result is built per lane, then narrowed to 16 bit.
257215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    movdqa     xmm3, xmm0    // B  first 4 pixels of argb
257315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    movdqa     xmm2, xmm0    // G
257415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pslld      xmm0, 8       // R: shift the duplicate top byte out
257515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrld      xmm3, 3       // B: top 5 bits of B land in bits 0..4
257615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrld      xmm2, 5       // G: top 6 bits of G land in bits 5..10
257715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrad      xmm0, 16      // R: now in bits 8..15, sign-extended above
257815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pand       xmm3, xmm5    // B
257915449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pand       xmm2, xmm6    // G
258015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pand       xmm0, xmm7    // R: keeps bits 11..15 plus the sign extension
258115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    por        xmm3, xmm2    // BG
258215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    por        xmm0, xmm3    // BGR
258315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    movdqa     xmm3, xmm1    // B  next 4 pixels of argb
258415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    movdqa     xmm2, xmm1    // G
258515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pslld      xmm1, 8       // R
258615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrld      xmm3, 3       // B
258715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrld      xmm2, 5       // G
258815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    psrad      xmm1, 16      // R
258915449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pand       xmm3, xmm5    // B
259015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pand       xmm2, xmm6    // G
259115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pand       xmm1, xmm7    // R
259215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    por        xmm3, xmm2    // BG
259315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    por        xmm1, xmm3    // BGR
    // Lanes are sign-extended 16 bit values, so the signed saturation in
    // packssdw passes the low 16 bits through unchanged.
259415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    packssdw   xmm0, xmm1
259515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    sub        ecx, 8        // sets flags for jg; movdqu and lea leave them intact
2596af1aa56f0b4a5fcac6c36a1a0c02b6917f2c14f2fbarchard@google.com    movdqu     [edx], xmm0   // store 8 pixels of RGB565
259715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    lea        edx, [edx + 16]
259815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    jg         convertloop   // repeat while pixels remain (ecx > 0)
259915449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com
260015449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pop        edi
260115449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    pop        esi
260215449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com    ret
260315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com  }
260415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com}
260515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com
260615449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com// 8 pixels, dest aligned 16.
260715449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of planar Y, U and V into ARGB, 8 pixels per iteration.
//   y_buf, u_buf, v_buf - source planes for this row.
//   dst_argb            - destination; written with movdqa, so it must be
//                         16 byte aligned.
//   width               - pixel count. The loop tests it only after a full
//                         group has been written, so ceil(width / 8) * 8
//                         pixels (minimum 8) are read and stored.
// Naked function: there is no compiler prologue, so arguments are fetched
// relative to esp by hand.
260815449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com__declspec(naked) __declspec(align(16))
2609e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I422ToARGBRow_SSSE3(const uint8* y_buf,
2610e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         const uint8* u_buf,
2611e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         const uint8* v_buf,
2612bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_argb,
2613e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         int width) {
2614d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  __asm {
2615d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       esi                  // saved here, restored before ret
2616d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       edi
2617d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y  (+8 skips the two pushes above)
2618d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2619d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2620e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edx, [esp + 8 + 16]  // argb
2621d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
2622d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        edi, esi             // edi = v_buf - u_buf; presumably READYUV422 reads V via esi + edi (macro not shown)
2623e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2624d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pxor       xmm4, xmm4           // zero, kept for the whole loop
2625d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2626c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2627eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
26284c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV422
26294c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
2630d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2631e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    // Step 3: Weave into ARGB
    // YUVTORGB leaves B in xmm0, G in xmm1, R in xmm2 (8 bytes each).
2632e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2633e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
2634e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     xmm1, xmm0
2635e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2636e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
2637e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx], xmm0          // aligned store, pixels 0..3
2638e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx + 16], xmm1     // aligned store, pixels 4..7
2639d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    lea        edx,  [edx + 32]
2640d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        ecx, 8
264118184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop          // repeat while pixels remain (ecx > 0)
2642d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2643d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        edi
2644d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        esi
2645d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    ret
2646d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  }
2647d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com}
2648d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2649e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// 8 pixels, dest aligned 16.
2650c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2651e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// Similar to I420 but duplicate UV once more.
// Converts one row of planar Y, U and V into ARGB, 8 pixels per iteration.
//   y_buf, u_buf, v_buf - source planes for this row.
//   dst_argb            - destination; written with movdqa, so it must be
//                         16 byte aligned.
//   width               - pixel count. The loop tests it only after a full
//                         group has been written, so ceil(width / 8) * 8
//                         pixels (minimum 8) are read and stored.
// Naked function: arguments are fetched relative to esp by hand. ebx is
// saved as well because READYUV411 modifies it, which is why the argument
// offsets use + 12 here instead of + 8.
2652d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
2653e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I411ToARGBRow_SSSE3(const uint8* y_buf,
2654e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         const uint8* u_buf,
2655e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         const uint8* v_buf,
2656bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_argb,
2657e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                         int width) {
2658d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  __asm {
265947e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    push       ebx                   // scratch for READYUV411; restored before ret
2660d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       esi
2661d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       edi
266247e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        eax, [esp + 12 + 4]   // Y  (+12 skips the three pushes above)
266347e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        esi, [esp + 12 + 8]   // U
266447e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        edi, [esp + 12 + 12]  // V
266547e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        edx, [esp + 12 + 16]  // argb
266647e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        ecx, [esp + 12 + 20]  // width
2667d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        edi, esi             // edi = v_buf - u_buf; presumably READYUV411 reads V via esi + edi (macro not shown)
2668d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2669d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pxor       xmm4, xmm4           // zero, kept for the whole loop
2670d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2671c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2672eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
267347e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    READYUV411  // modifies EBX
26744c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
2675d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2676d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    // Step 3: Weave into ARGB
    // YUVTORGB leaves B in xmm0, G in xmm1, R in xmm2 (8 bytes each).
2677e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2678e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
2679e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     xmm1, xmm0
2680e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2681e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
2682e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx], xmm0          // aligned store, pixels 0..3
2683d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    movdqa     [edx + 16], xmm1     // aligned store, pixels 4..7
2684d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    lea        edx,  [edx + 32]
2685d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        ecx, 8
268618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop          // repeat while pixels remain (ecx > 0)
2687d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
2688d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        edi
2689d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        esi
269047e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    pop        ebx
2691d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    ret
2692d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  }
2693d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com}
2694d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
26952d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com// 8 pixels, dest aligned 16.
2696c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row from a Y buffer plus an interleaved UV buffer to ARGB.
// Each pass stores 32 bytes with movdqa, so dst_argb must be 16-byte aligned.
// The counter drops by 8 per pass and the loop repeats while it is positive,
// so a width that is not a multiple of 8 still gets a whole final 8-pixel
// store.
// READNV12 and YUVTORGB are macros defined outside this function; judging by
// the weave comments they presumably leave B, G, R bytes in xmm0, xmm1, xmm2.
26972d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com__declspec(naked) __declspec(align(16))
26982d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.comvoid NV12ToARGBRow_SSSE3(const uint8* y_buf,
26992d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                         const uint8* uv_buf,
2700bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_argb,
27012d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                         int width) {
27022d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  __asm {
27032d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    push       esi
    // Naked function: arguments are read straight off the stack. The leading
    // "4 +" skips the esi value pushed above.
27042d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        eax, [esp + 4 + 4]   // Y
27052d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        esi, [esp + 4 + 8]   // UV
27062d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        edx, [esp + 4 + 12]  // argb
27072d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
27082d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
27092d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pxor       xmm4, xmm4
27102d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2711c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
27122d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com convertloop:
27132d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    READNV12
27142d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    YUVTORGB
27152d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
27162d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    // Step 3: Weave into ARGB
27172d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
27182d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
27192d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     xmm1, xmm0
27202d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
27212d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Aligned stores: movdqa faults if edx is not 16-byte aligned.
27222d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     [edx], xmm0
27232d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     [edx + 16], xmm1
27242d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
27252d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    sub        ecx, 8
27262d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    jg         convertloop
27272d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
    // Restore the register saved on entry.
27282d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pop        esi
27292d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    ret
27302d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  }
27312d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com}
27322d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
27332d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com// 8 pixels, dest aligned 16.
2734c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row from a Y buffer plus an interleaved chroma buffer to ARGB.
// The chroma bytes are labelled VU below, and the same READNV12 loader is
// used as for NV12; the swapped order is presumably handled by YVUTORGB
// (defined outside this function) in place of YUVTORGB.
// Each pass stores 32 bytes with movdqa, so dst_argb must be 16-byte aligned.
// The loop repeats while the counter is positive after subtracting 8, so a
// width that is not a multiple of 8 still gets a whole final 8-pixel store.
27352d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com__declspec(naked) __declspec(align(16))
27362d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.comvoid NV21ToARGBRow_SSSE3(const uint8* y_buf,
27372d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                         const uint8* uv_buf,
2738bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_argb,
27392d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                         int width) {
27402d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  __asm {
27412d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    push       esi
    // Naked function: arguments are read straight off the stack. The leading
    // "4 +" skips the esi value pushed above.
27422d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        eax, [esp + 4 + 4]   // Y
27432d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        esi, [esp + 4 + 8]   // VU
27442d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        edx, [esp + 4 + 12]  // argb
27452d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
27462d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
27472d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pxor       xmm4, xmm4
27482d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2749c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
27502d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com convertloop:
27512d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    READNV12
27522d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    YVUTORGB
27532d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
27542d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    // Step 3: Weave into ARGB
27552d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
27562d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
27572d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     xmm1, xmm0
27582d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
27592d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Aligned stores: movdqa faults if edx is not 16-byte aligned.
27602d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     [edx], xmm0
27612d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     [edx + 16], xmm1
27622d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
27632d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    sub        ecx, 8
27642d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    jg         convertloop
27652d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
    // Restore the register saved on entry.
27662d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pop        esi
27672d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    ret
27682d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  }
27692d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com}
27702d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2771e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// 8 pixels, unaligned.
2772c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row from separate Y, U and V buffers to ARGB.
// Stores use movdqu, so dst_argb needs no particular alignment.
// The loop repeats while the counter is positive after subtracting 8, so a
// width that is not a multiple of 8 still gets a whole final 8-pixel store.
// READYUV444 and YUVTORGB are macros defined outside this function; judging
// by the weave comments they presumably leave B, G, R bytes in xmm0, xmm1,
// xmm2.
2773d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
2774e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2775952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com                                   const uint8* u_buf,
2776952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com                                   const uint8* v_buf,
2777bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_argb,
2778952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com                                   int width) {
2779952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com  __asm {
2780952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    push       esi
2781952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    push       edi
    // Naked function: arguments are read straight off the stack. The leading
    // "8 +" skips the two registers pushed above.
2782952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
2783952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2784952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2785e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edx, [esp + 8 + 16]  // argb
2786952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the distance from U to V; presumably the read macro then
    // addresses V as [esi + edi] so only esi needs advancing.
2787952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    sub        edi, esi
2788952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
2789952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pxor       xmm4, xmm4
2790952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
2791c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2792952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com convertloop:
27934c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV444
27944c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
2795952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
2796952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    // Step 3: Weave into ARGB
2797952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2798952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
2799952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    movdqa     xmm1, xmm0
2800952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2801952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Unaligned stores: no alignment requirement on edx.
28024c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    movdqu     [edx], xmm0
28034c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    movdqu     [edx + 16], xmm1
2804e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
2805e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        ecx, 8
2806e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    jg         convertloop
2807e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
    // Restore saved registers in reverse push order.
2808e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pop        edi
2809e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pop        esi
2810e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    ret
2811e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  }
2812e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com}
2813e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2814e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// 8 pixels, unaligned.
2815c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row from separate Y, U and V buffers to ARGB.
// Stores use movdqu, so dst_argb needs no particular alignment.
// The loop repeats while the counter is positive after subtracting 8, so a
// width that is not a multiple of 8 still gets a whole final 8-pixel store.
// READYUV422 and YUVTORGB are macros defined outside this function; judging
// by the weave comments they presumably leave B, G, R bytes in xmm0, xmm1,
// xmm2.
2816e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com__declspec(naked) __declspec(align(16))
2817e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2818e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   const uint8* u_buf,
2819e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   const uint8* v_buf,
2820bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_argb,
2821e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   int width) {
2822e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  __asm {
2823e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    push       esi
2824e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    push       edi
    // Naked function: arguments are read straight off the stack. The leading
    // "8 +" skips the two registers pushed above.
2825e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
2826e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2827e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2828e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edx, [esp + 8 + 16]  // argb
2829e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the distance from U to V; presumably the read macro then
    // addresses V as [esi + edi] so only esi needs advancing.
2830e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        edi, esi
2831e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
2832e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pxor       xmm4, xmm4
2833e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2834c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2835e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com convertloop:
28364c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV422
28374c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
2838e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2839e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    // Step 3: Weave into ARGB
2840e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2841e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
2842e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     xmm1, xmm0
2843e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2844e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Unaligned stores: no alignment requirement on edx.
28454c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    movdqu     [edx], xmm0
28464c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    movdqu     [edx + 16], xmm1
2847952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
2848952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    sub        ecx, 8
2849952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    jg         convertloop
2850952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
    // Restore saved registers in reverse push order.
2851952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pop        edi
2852952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pop        esi
2853952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    ret
2854952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com  }
2855952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com}
2856952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
2857e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// 8 pixels, unaligned.
2858c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2859e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com// Similar to I420 but duplicate UV once more.
// Converts one row from separate Y, U and V buffers to ARGB.
// Stores use movdqu, so dst_argb needs no particular alignment.
// The loop repeats while the counter is positive after subtracting 8, so a
// width that is not a multiple of 8 still gets a whole final 8-pixel store.
// READYUV411 and YUVTORGB are macros defined outside this function. ebx is
// saved and restored here because READYUV411 is noted as modifying it.
2860d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
2861e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2862952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com                                   const uint8* u_buf,
2863952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com                                   const uint8* v_buf,
2864bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_argb,
2865952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com                                   int width) {
2866952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com  __asm {
286747e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    push       ebx
2868952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    push       esi
2869952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    push       edi
    // Naked function: arguments are read straight off the stack. The leading
    // "12 +" skips the three registers pushed above.
287047e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        eax, [esp + 12 + 4]   // Y
287147e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        esi, [esp + 12 + 8]   // U
287247e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        edi, [esp + 12 + 12]  // V
287347e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        edx, [esp + 12 + 16]  // argb
287447e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    mov        ecx, [esp + 12 + 20]  // width
    // edi becomes the distance from U to V; presumably the read macro then
    // addresses V as [esi + edi] so only esi needs advancing.
2875e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        edi, esi
2876e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
2877e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pxor       xmm4, xmm4
2878e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2879c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2880e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com convertloop:
288147e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    READYUV411  // modifies EBX
28824c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
2883e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
2884e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    // Step 3: Weave into ARGB
2885e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
2886e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
2887e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     xmm1, xmm0
2888e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2889e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Unaligned stores: no alignment requirement on edx.
28904c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    movdqu     [edx], xmm0
28914c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    movdqu     [edx + 16], xmm1
2892e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
2893e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        ecx, 8
2894e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    jg         convertloop
2895e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
    // Restore saved registers in reverse push order.
2896e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pop        edi
2897e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pop        esi
289847e856c632f0a310004601b86493220a6993d7b4fbarchard@google.com    pop        ebx
2899e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    ret
2900e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  }
2901e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com}
2902e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
290315449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com// 8 pixels, unaligned.
290415449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row from a Y buffer plus an interleaved UV buffer to ARGB.
// Stores use movdqu, so dst_argb needs no particular alignment.
// The loop repeats while the counter is positive after subtracting 8, so a
// width that is not a multiple of 8 still gets a whole final 8-pixel store.
// READNV12 and YUVTORGB are macros defined outside this function; judging by
// the weave comments they presumably leave B, G, R bytes in xmm0, xmm1, xmm2.
290515449263c4bba75bc396dc3d60266efee6ab6c66fbarchard@google.com__declspec(naked) __declspec(align(16))
29062d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.comvoid NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
29072d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                                   const uint8* uv_buf,
2908bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_argb,
29092d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                                   int width) {
29102d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  __asm {
29112d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    push       esi
    // Naked function: arguments are read straight off the stack. The leading
    // "4 +" skips the esi value pushed above.
29122d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        eax, [esp + 4 + 4]   // Y
29132d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        esi, [esp + 4 + 8]   // UV
29142d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        edx, [esp + 4 + 12]  // argb
29152d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
29162d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
29172d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pxor       xmm4, xmm4
29182d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2919c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
29202d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com convertloop:
29212d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    READNV12
29222d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    YUVTORGB
29232d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
29242d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    // Step 3: Weave into ARGB
29252d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
29262d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
29272d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     xmm1, xmm0
29282d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
29292d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Unaligned stores: no alignment requirement on edx.
29302d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqu     [edx], xmm0
29312d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqu     [edx + 16], xmm1
29322d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
29332d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    sub        ecx, 8
29342d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    jg         convertloop
29352d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
    // Restore the register saved on entry.
29362d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pop        esi
29372d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    ret
29382d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  }
29392d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com}
29402d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
29412d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com// 8 pixels, unaligned.
2942c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row from a Y buffer plus an interleaved chroma buffer to ARGB.
// The chroma bytes are labelled VU below, and the same READNV12 loader is
// used as for NV12; the swapped order is presumably handled by YVUTORGB
// (defined outside this function) in place of YUVTORGB.
// Stores use movdqu, so dst_argb needs no particular alignment.
// The loop repeats while the counter is positive after subtracting 8, so a
// width that is not a multiple of 8 still gets a whole final 8-pixel store.
29432d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com__declspec(naked) __declspec(align(16))
29442d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.comvoid NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
29452d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                                   const uint8* uv_buf,
2946bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_argb,
29472d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com                                   int width) {
29482d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  __asm {
29492d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    push       esi
    // Naked function: arguments are read straight off the stack. The leading
    // "4 +" skips the esi value pushed above.
29502d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        eax, [esp + 4 + 4]   // Y
29512d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        esi, [esp + 4 + 8]   // VU
29522d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        edx, [esp + 4 + 12]  // argb
29532d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
29542d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
29552d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pxor       xmm4, xmm4
29562d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
2957c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
29582d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com convertloop:
29592d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    READNV12
29602d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    YVUTORGB
29612d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
29622d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    // Step 3: Weave into ARGB
29632d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm0, xmm1           // BG
29642d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklbw  xmm2, xmm5           // RA
29652d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqa     xmm1, xmm0
29662d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
29672d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    // Unaligned stores: no alignment requirement on edx.
29682d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqu     [edx], xmm0
29692d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    movdqu     [edx + 16], xmm1
29702d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
29712d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    sub        ecx, 8
29722d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    jg         convertloop
29732d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
    // Restore the register saved on entry.
29742d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    pop        esi
29752d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com    ret
29762d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com  }
29772d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com}
29782d9fe08225ab28f62b515b2b914accc6a7b060fbfbarchard@google.com
// 8 pixels per pass, stored with movdqa, so dst_bgra must be 16-byte aligned.
// Converts one row from separate Y, U and V buffers to BGRA (32 bytes per
// pass). The loop repeats while the counter is positive after subtracting 8,
// so a width that is not a multiple of 8 still gets a whole final 8-pixel
// store.
// READYUV422 and YUVTORGB are macros defined outside this function; judging
// by the weave comments they presumably leave B, G, R bytes in xmm0, xmm1,
// xmm2.
2979e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com__declspec(naked) __declspec(align(16))
2980e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I422ToBGRARow_SSSE3(const uint8* y_buf,
2981e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                         const uint8* u_buf,
2982e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                         const uint8* v_buf,
2983bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_bgra,
2984e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                         int width) {
2985e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  __asm {
2986e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    push       esi
2987e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    push       edi
    // Naked function: arguments are read straight off the stack. The leading
    // "8 +" skips the two registers pushed above.
2988e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
2989e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
2990e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
2991e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edx, [esp + 8 + 16]  // bgra
2992952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the distance from U to V; presumably the read macro then
    // addresses V as [esi + edi] so only esi needs advancing.
2993952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    sub        edi, esi
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
2994952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pxor       xmm4, xmm4
2995952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
2996c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
2997952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com convertloop:
29984c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV422
29994c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
3000952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
3001952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    // Step 3: Weave into BGRA
    // The alpha mask is rebuilt on every pass because xmm5 is overwritten as
    // the destination of the AR unpack below.
3002952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
3003952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklbw  xmm1, xmm0           // GB
3004952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklbw  xmm5, xmm2           // AR
3005952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    movdqa     xmm0, xmm5
3006952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
3007952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    // Aligned stores: movdqa faults if edx is not 16-byte aligned.
3008e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx], xmm5
3009e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx + 16], xmm0
3010952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
3011952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    sub        ecx, 8
3012952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    jg         convertloop
3013952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
    // Restore saved registers in reverse push order.
3014952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pop        edi
3015952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pop        esi
3016952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    ret
3017952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com  }
3018952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com}
3019952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
// 8 pixels per pass, stored with movdqu, so dst_bgra needs no particular
// alignment.
// Converts one row from separate Y, U and V buffers to BGRA (32 bytes per
// pass). The loop repeats while the counter is positive after subtracting 8,
// so a width that is not a multiple of 8 still gets a whole final 8-pixel
// store.
// READYUV422 and YUVTORGB are macros defined outside this function; judging
// by the weave comments they presumably leave B, G, R bytes in xmm0, xmm1,
// xmm2.
3020d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
302125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
302225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                                   const uint8* u_buf,
302325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                                   const uint8* v_buf,
3024bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_bgra,
302525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                                   int width) {
302625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  __asm {
302725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       esi
302825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       edi
    // Naked function: arguments are read straight off the stack. The leading
    // "8 +" skips the two registers pushed above.
302925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
303025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
303125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
303225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8 + 16]  // bgra
303325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the distance from U to V; presumably the read macro then
    // addresses V as [esi + edi] so only esi needs advancing.
303425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        edi, esi
    // xmm4 = 0; presumably consumed by the macros below (not visible here).
303525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pxor       xmm4, xmm4
303625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
3037c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
303825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com convertloop:
303925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    READYUV422
304025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    YUVTORGB
304125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
304225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // Step 3: Weave into BGRA
    // The alpha mask is rebuilt on every pass because xmm5 is overwritten as
    // the destination of the AR unpack below.
304325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
304425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm1, xmm0           // GB
304525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm5, xmm2           // AR
304625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm0, xmm5
304725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
304825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    // Unaligned stores: no alignment requirement on edx.
304925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx], xmm5
305025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx + 16], xmm0
305125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels written; repeat while the remaining count is positive.
305225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        ecx, 8
305325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    jg         convertloop
305425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
    // Restore saved registers in reverse push order.
305525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        edi
305625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        esi
305725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    ret
305825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  }
305925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com}
306025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
// Converts one row of I422 (separate Y, U and V buffers) to ABGR, 8 pixels
// per loop iteration. Results are stored with movdqa, so dst_abgr has to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes (presumably consumed by READYUV422).
//   dst_abgr            - destination; 32 bytes are written per iteration.
//   width               - pixel count. The loop repeats while the remaining
//                         count is positive, so a width that is not a
//                         multiple of 8 still stores a full 8 pixels on the
//                         last pass.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved and restored here.
// NOTE(review): READYUV422 and YUVTORGB are macros defined outside this
// section. The register roles used in the comments below (xmm0 = B, xmm1 = G,
// xmm2 = R after YUVTORGB) follow the existing annotations - confirm against
// the macro definitions.
306125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com__declspec(naked) __declspec(align(16))
3062e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.comvoid I422ToABGRRow_SSSE3(const uint8* y_buf,
3063e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                         const uint8* u_buf,
3064e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                         const uint8* v_buf,
3065bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_abgr,
3066e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                         int width) {
3067952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com  __asm {
3068952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    push       esi
3069952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    push       edi
    // Stack offsets: 8 skips the two registers pushed above; the first
    // argument sits just past the 4-byte return address.
3070952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
3071952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
3072952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
3073e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edx, [esp + 8 + 16]  // abgr
3074952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte distance v_buf - u_buf.
3075952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    sub        edi, esi
    // xmm5 is only a source operand in the weave below, so it is set once
    // outside the loop. This relies on the macros leaving xmm5 untouched -
    // TODO confirm.
3076952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0 (presumably a zero register for the macros - TODO confirm).
3077952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pxor       xmm4, xmm4
3078952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
3079c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3080952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com convertloop:
30814c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV422
30824c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
3083952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
3084952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    // Step 3: Weave into ABGR
3085952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklbw  xmm2, xmm1           // RG
3086952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklbw  xmm0, xmm5           // BA
3087952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    movdqa     xmm1, xmm2
3088952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpcklwd  xmm2, xmm0           // ABGR first 4 pixels (bytes R,G,B,A)
3089952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    punpckhwd  xmm1, xmm0           // ABGR next 4 pixels
3090e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx], xmm2
3091e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     [edx + 16], xmm1
3092952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels consumed; keep looping while the remaining count is > 0.
3093952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    sub        ecx, 8
3094952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    jg         convertloop
3095952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
3096952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pop        edi
3097952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    pop        esi
3098952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com    ret
3099952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com  }
3100952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com}
3101952a507ca6967558c2ae773321e003b6f2bb943afbarchard@google.com
// Converts one row of I422 (separate Y, U and V buffers) to ABGR, 8 pixels
// per loop iteration, storing the result with unaligned (movdqu) writes.
//   y_buf, u_buf, v_buf - source planes (presumably consumed by READYUV422).
//   dst_abgr            - destination; 32 bytes are written per iteration.
//   width               - pixel count. The loop repeats while the remaining
//                         count is positive, so a width that is not a
//                         multiple of 8 still stores a full 8 pixels on the
//                         last pass.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved and restored here.
// NOTE(review): READYUV422 and YUVTORGB are macros defined outside this
// section. The register roles used in the comments below (xmm0 = B, xmm1 = G,
// xmm2 = R after YUVTORGB) follow the existing annotations - confirm against
// the macro definitions.
3102d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
310325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
3104e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   const uint8* u_buf,
3105e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   const uint8* v_buf,
3106bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_abgr,
3107e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   int width) {
3108d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  __asm {
3109d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       esi
3110d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    push       edi
    // Stack offsets: 8 skips the two registers pushed above; the first
    // argument sits just past the 4-byte return address.
3111d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
3112d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
3113d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
311425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8 + 16]  // abgr
3115d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte distance v_buf - u_buf.
3116d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        edi, esi
    // xmm5 is only a source operand in the weave below, so it is set once
    // outside the loop. This relies on the macros leaving xmm5 untouched -
    // TODO confirm.
311725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0 (presumably a zero register for the macros - TODO confirm).
3118d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pxor       xmm4, xmm4
3119d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
3120c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3121eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
31224c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV422
31234c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
3124e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
312525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // Step 3: Weave into ABGR
312625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm2, xmm1           // RG
312725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm0, xmm5           // BA
312825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm1, xmm2
312925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklwd  xmm2, xmm0           // ABGR first 4 pixels (bytes R,G,B,A)
313025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpckhwd  xmm1, xmm0           // ABGR next 4 pixels
313125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx], xmm2
313225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx + 16], xmm1
313325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels consumed; keep looping while the remaining count is > 0.
313425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        ecx, 8
313525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    jg         convertloop
313625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
313725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        edi
313825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pop        esi
313925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    ret
314025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  }
314125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com}
314225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
// Converts one row of I422 (separate Y, U and V buffers) to RGBA, 8 pixels
// per loop iteration. Results are stored with movdqa, so dst_rgba has to be
// 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes (presumably consumed by READYUV422).
//   dst_rgba            - destination; 32 bytes are written per iteration.
//   width               - pixel count. The loop repeats while the remaining
//                         count is positive, so a width that is not a
//                         multiple of 8 still stores a full 8 pixels on the
//                         last pass.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved and restored here.
// NOTE(review): READYUV422 and YUVTORGB are macros defined outside this
// section. The register roles used in the comments below (xmm0 = B, xmm1 = G,
// xmm2 = R after YUVTORGB) follow the existing annotations - confirm against
// the macro definitions.
314325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com__declspec(naked) __declspec(align(16))
314425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid I422ToRGBARow_SSSE3(const uint8* y_buf,
314525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                         const uint8* u_buf,
314625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                         const uint8* v_buf,
3147bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                         uint8* dst_rgba,
314825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com                         int width) {
314925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com  __asm {
315025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       esi
315125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    push       edi
    // Stack offsets: 8 skips the two registers pushed above; the first
    // argument sits just past the 4-byte return address.
315225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
315325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
315425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
315525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8 + 16]  // rgba
315625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte distance v_buf - u_buf.
315725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    sub        edi, esi
    // xmm4 = 0 (presumably a zero register for the macros - TODO confirm).
315825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pxor       xmm4, xmm4
315925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
3160c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
316125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com convertloop:
316225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    READYUV422
316325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    YUVTORGB
316425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
316525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // Step 3: Weave into RGBA
    // xmm5 is overwritten by the unpack below, so the all-ones alpha value
    // is regenerated on every iteration.
3166e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
316725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm1, xmm2           // GR
316825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm5, xmm0           // AB
3169e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    movdqa     xmm0, xmm5
317025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels (bytes A,B,G,R)
317125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
317225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     [edx], xmm5
317325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     [edx + 16], xmm0
3174e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels consumed; keep looping while the remaining count is > 0.
3175e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        ecx, 8
3176e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    jg         convertloop
3177e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
3178e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pop        edi
3179e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pop        esi
3180e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    ret
3181e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  }
3182e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com}
3183e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
// Converts one row of I422 (separate Y, U and V buffers) to RGBA, 8 pixels
// per loop iteration, storing the result with unaligned (movdqu) writes.
//   y_buf, u_buf, v_buf - source planes (presumably consumed by READYUV422).
//   dst_rgba            - destination; 32 bytes are written per iteration.
//   width               - pixel count. The loop repeats while the remaining
//                         count is positive, so a width that is not a
//                         multiple of 8 still stores a full 8 pixels on the
//                         last pass.
// Naked function: arguments are fetched from the stack by hand and esi/edi
// are saved and restored here.
// NOTE(review): READYUV422 and YUVTORGB are macros defined outside this
// section. The register roles used in the comments below (xmm0 = B, xmm1 = G,
// xmm2 = R after YUVTORGB) follow the existing annotations - confirm against
// the macro definitions.
3184e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com__declspec(naked) __declspec(align(16))
318525dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.comvoid I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
3186e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   const uint8* u_buf,
3187e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   const uint8* v_buf,
3188bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.com                                   uint8* dst_rgba,
3189e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com                                   int width) {
3190e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com  __asm {
3191e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    push       esi
3192e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    push       edi
    // Stack offsets: 8 skips the two registers pushed above; the first
    // argument sits just past the 4-byte return address.
3193e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        eax, [esp + 8 + 4]   // Y
3194e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        esi, [esp + 8 + 8]   // U
3195e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        edi, [esp + 8 + 12]  // V
319625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    mov        edx, [esp + 8 + 16]  // rgba
3197e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte distance v_buf - u_buf.
3198e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        edi, esi
    // xmm4 = 0 (presumably a zero register for the macros - TODO confirm).
3199e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    pxor       xmm4, xmm4
3200e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com
3201c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3202e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com convertloop:
32034c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    READYUV422
32044c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com    YUVTORGB
3205d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
320625dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    // Step 3: Weave into RGBA
    // xmm5 is overwritten by the unpack below, so the all-ones alpha value
    // is regenerated on every iteration.
320725dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
320825dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm1, xmm2           // GR
320925dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklbw  xmm5, xmm0           // AB
321025dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqa     xmm0, xmm5
321125dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels (bytes A,B,G,R)
321225dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
321325dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx], xmm5
321425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com    movdqu     [edx + 16], xmm0
3215e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels consumed; keep looping while the remaining count is > 0.
3216e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com    sub        ecx, 8
321718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
3218d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
3219d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        edi
3220d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    pop        esi
3221d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    ret
3222d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  }
3223d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com}
322425dc05858e39843299cb66715bdd4e3edd2f89a6fbarchard@google.com
3225e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com#endif  // HAS_I422TOARGBROW_SSSE3
3226d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
3227e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com#ifdef HAS_YTOARGBROW_SSE2
// Expands a row of 8-bit Y samples into 4-byte pixels, 8 pixels per loop
// iteration. Each Y is scaled to a single value G, which is replicated into
// the three low bytes of the pixel; the fourth byte is forced to 0xff.
//   y_buf   - source; 8 bytes are read per iteration.
//   rgb_buf - destination; 32 bytes are written per iteration with movdqa,
//             so it has to be 16-byte aligned.
//   width   - pixel count. The loop repeats while the remaining count is
//             positive, so a width that is not a multiple of 8 still
//             processes a full 8 pixels on the last pass.
// Naked function: nothing is pushed, so arguments start at [esp + 4].
3228d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
3229e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid YToARGBRow_SSE2(const uint8* y_buf,
3230e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                     uint8* rgb_buf,
3231e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                     int width) {
3232d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  __asm {
    // xmm5 = 0, used below to widen bytes to 16-bit words.
323330859f75f28c2435753d33eb7a48ccab169feb6dfbarchard@google.com    pxor       xmm5, xmm5
32348b9759c4a757ee5d1f005cfececd8382c357e5fefbarchard@google.com    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
32358b9759c4a757ee5d1f005cfececd8382c357e5fefbarchard@google.com    pslld      xmm4, 24
    // eax is used as scratch to build the constants before it is loaded
    // with y_buf. xmm3 = 16 in every 16-bit lane.
323698a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    mov        eax, 0x00100010
323798a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    movd       xmm3, eax
323898a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshufd     xmm3, xmm3, 0
    // xmm2 = 74 in every 16-bit lane; combined with the shift by 6 in the
    // loop this multiplies by 74 / 64 = 1.15625.
323998a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    mov        eax, 0x004a004a       // 74
324098a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    movd       xmm2, eax
324198a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshufd     xmm2, xmm2,0
3242d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        eax, [esp + 4]       // Y
3243d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        edx, [esp + 8]       // rgb
3244d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    mov        ecx, [esp + 12]      // width
3245d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
3246c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3247eaedc1d72735e68d45a0b42221a04902e648a21dfbarchard@google.com convertloop:
3248d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    // Implemented in 16-bit fixed point as ((y - 16) * 74) >> 6. The
    // subtraction saturates at 0 and the final pack saturates at 255.
3249373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    movq       xmm0, qword ptr [eax]
3250d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    lea        eax, [eax + 8]
325130859f75f28c2435753d33eb7a48ccab169feb6dfbarchard@google.com    punpcklbw  xmm0, xmm5           // 0.Y
32528b9759c4a757ee5d1f005cfececd8382c357e5fefbarchard@google.com    psubusw    xmm0, xmm3
325330859f75f28c2435753d33eb7a48ccab169feb6dfbarchard@google.com    pmullw     xmm0, xmm2
325430859f75f28c2435753d33eb7a48ccab169feb6dfbarchard@google.com    psrlw      xmm0, 6
3255d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    packuswb   xmm0, xmm0           // G
3256d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
3257d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    // Step 2: Weave into ARGB
3258d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpcklbw  xmm0, xmm0           // GG
3259d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    movdqa     xmm1, xmm0
3260d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpcklwd  xmm0, xmm0           // GGGG first 4 pixels
3261d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    punpckhwd  xmm1, xmm1           // GGGG next 4 pixels
    // OR in 0xff000000 per pixel so the top byte of each pixel is 0xff.
32628b9759c4a757ee5d1f005cfececd8382c357e5fefbarchard@google.com    por        xmm0, xmm4
32638b9759c4a757ee5d1f005cfececd8382c357e5fefbarchard@google.com    por        xmm1, xmm4
32643fe369661abbd1bbca12bd69dc8be0be9a5f9792fbarchard@google.com    movdqa     [edx], xmm0
3265d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    movdqa     [edx + 16], xmm1
3266d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    lea        edx,  [edx + 32]
    // 8 pixels consumed; keep looping while the remaining count is > 0.
3267d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    sub        ecx, 8
326818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
3269d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com
3270d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com    ret
3271d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com  }
3272d93d4486eb6cefba07f4707db5cce5509dc0145dfbarchard@google.com}
3273e214fe3f070d47d34e3cfbf4431994f97c9e0d1bfbarchard@google.com#endif  // HAS_YTOARGBROW_SSE2
327412d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com
327542831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.com#ifdef HAS_MIRRORROW_SSSE3
327612d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com// Shuffle table for reversing the bytes.
// Entry i holds 15 - i, so when used as a pshufb control (as MirrorRow_SSSE3
// does) result byte i is taken from source byte 15 - i.
3277851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMirror = {
327812d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
327912d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com};
3280228bdc24e44264baf3402124aaa6d4d81c8896f5fbarchard@google.com
// Reverses a row of bytes, 16 bytes per loop iteration: the last 16 bytes of
// the remaining source are byte-reversed with pshufb and written to the
// front of dst; src is then walked backwards while dst advances.
//   src   - source row.
//   dst   - destination row.
//   width - number of bytes. The loop repeats while the remaining count is
//           positive.
// Loads and stores use movdqa, which faults on addresses that are not
// 16-byte aligned, so src + width and dst are expected to be 16-byte aligned.
// Naked function: nothing is pushed, so arguments start at [esp + 4].
3281d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
328242831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.comvoid MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3283f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
328412d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    mov       eax, [esp + 4]   // src
328512d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    mov       edx, [esp + 8]   // dst
328612d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    mov       ecx, [esp + 12]  // width
328742831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.com    movdqa    xmm5, kShuffleMirror
    // Bias src by -16 so that [eax + ecx] addresses the last 16 bytes of
    // the part of the row that is still to be processed.
328812d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    lea       eax, [eax - 16]
3289ba3aeed3b86dfae7bc0631c8bed9b50303318dcafbarchard@google.com
3290c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
32910e6ce93c84f710e6a589c6c6edfe480ad0567f0cfbarchard@google.com convertloop:
32920e6ce93c84f710e6a589c6c6edfe480ad0567f0cfbarchard@google.com    movdqa    xmm0, [eax + ecx]
329312d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    pshufb    xmm0, xmm5
    // The movdqa and lea that follow do not modify the flags, so the jg at
    // the bottom of the loop tests the result of this sub.
32940e6ce93c84f710e6a589c6c6edfe480ad0567f0cfbarchard@google.com    sub       ecx, 16
329512d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    movdqa    [edx], xmm0
329612d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    lea       edx, [edx + 16]
329718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
329812d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com    ret
329912d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com  }
330012d048335db029aa66396d2fc09be0612afe8b59fbarchard@google.com}
33014c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com#endif  // HAS_MIRRORROW_SSSE3
3302585a126140be298e60a4daa26140ead0e94eaaa1fbarchard@google.com
33032007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com#ifdef HAS_MIRRORROW_AVX2
33042007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com// Shuffle table for reversing the bytes.
// The same 15..0 pattern is listed twice, once for each 16-byte half of the
// 32-byte vector: vpshufb shuffles within each 128-bit lane independently, so
// each half is reversed in place (MirrorRow_AVX2 then swaps the halves).
3305851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const ulvec8 kShuffleMirror_AVX2 = {
33062007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
33072007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
33082007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com};
33092007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com
// Reverses a row of bytes, 32 bytes per loop iteration: the last 32 bytes of
// the remaining source are reversed and written to the front of dst; src is
// then walked backwards while dst advances.
//   src   - source row; read with vmovdqu (no alignment requirement).
//   dst   - destination row; written with vmovdqu.
//   width - number of bytes. The loop repeats while the remaining count is
//           positive, so a width that is not a multiple of 32 still moves a
//           full 32 bytes on the last pass.
// NOTE(review): the table is loaded with vmovdqa, which needs a 32-byte
// aligned operand; this presumably relies on the ulvec8 type being declared
// with that alignment - confirm.
// Naked function: nothing is pushed, so arguments start at [esp + 4].
33102007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com__declspec(naked) __declspec(align(16))
33112007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.comvoid MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
33122007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com  __asm {
33132007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    mov       eax, [esp + 4]   // src
33142007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    mov       edx, [esp + 8]   // dst
33152007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    mov       ecx, [esp + 12]  // width
33162007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    vmovdqa   ymm5, kShuffleMirror_AVX2
    // Bias src by -32 so that [eax + ecx] addresses the last 32 bytes of
    // the part of the row that is still to be processed.
33172007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    lea       eax, [eax - 32]
33182007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com
3319c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
33202007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com convertloop:
33212007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    vmovdqu   ymm0, [eax + ecx]
    // vpshufb reverses the bytes inside each 128-bit lane; vpermq with
    // 0x4e then exchanges the two lanes to complete the 32-byte reversal.
33222007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    vpshufb   ymm0, ymm0, ymm5
33232007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
    // The vmovdqu and lea that follow do not modify the flags, so the jg
    // at the bottom of the loop tests the result of this sub.
33242007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    sub       ecx, 32
33252007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    vmovdqu   [edx], ymm0
33262007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    lea       edx, [edx + 32]
33272007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    jg        convertloop
    // Clear the upper halves of the ymm registers before returning.
33289b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
33292007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com    ret
33302007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com  }
33312007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com}
33322007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com#endif  // HAS_MIRRORROW_AVX2
33332007dca6dcfee6828f604068d7d17842fbe5d646fbarchard@google.com
333442831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.com#ifdef HAS_MIRRORROW_SSE2
33352d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
333642831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.com// version can not.
// Reverses a row of bytes, 16 bytes per loop iteration, without pshufb: the
// reversal is built from a byte swap inside each 16-bit word, a word
// reversal inside each 64-bit half, and a swap of the two halves.
//   src   - source row.
//   dst   - destination row.
//   width - number of bytes. The loop repeats while the remaining count is
//           positive, so a width that is not a multiple of 16 still moves a
//           full 16 bytes on the last pass.
// Naked function: nothing is pushed, so arguments start at [esp + 4].
3337d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
333842831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.comvoid MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3339f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
3340373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    mov       eax, [esp + 4]   // src
3341373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    mov       edx, [esp + 8]   // dst
3342373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    mov       ecx, [esp + 12]  // width
    // Bias src by -16 so that [eax + ecx] addresses the last 16 bytes of
    // the part of the row that is still to be processed.
3343373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    lea       eax, [eax - 16]
3344ba3aeed3b86dfae7bc0631c8bed9b50303318dcafbarchard@google.com
3345c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
33460e6ce93c84f710e6a589c6c6edfe480ad0567f0cfbarchard@google.com convertloop:
334742831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.com    movdqu    xmm0, [eax + ecx]
33482d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa    xmm1, xmm0        // swap bytes
3349373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    psllw     xmm0, 8
3350373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    psrlw     xmm1, 8
3351373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    por       xmm0, xmm1
    // 0x1b selects words 3,2,1,0, i.e. reverses the four words of a half.
3352373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    pshuflw   xmm0, xmm0, 0x1b  // swap words
3353373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    pshufhw   xmm0, xmm0, 0x1b
335445b9ef0f6a404fe416d7a04bbd6da13037f3716bfbarchard@google.com    pshufd    xmm0, xmm0, 0x4e  // swap qwords
    // The movdqu and lea that follow do not modify the flags, so the jg at
    // the bottom of the loop tests the result of this sub.
33550e6ce93c84f710e6a589c6c6edfe480ad0567f0cfbarchard@google.com    sub       ecx, 16
335642831e0aae4c786e40302ac03bf5d679796b5c3ffbarchard@google.com    movdqu    [edx], xmm0
3357373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    lea       edx, [edx + 16]
335818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
3359373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com    ret
3360373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com  }
3361373cdbdc58d6e7b7e4653840677ef01468607e84fbarchard@google.com}
33624c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com#endif  // HAS_MIRRORROW_SSE2
3363e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
336416a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com#ifdef HAS_MIRRORROW_UV_SSSE3
336516a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com// Shuffle table for reversing the bytes of UV channels.
// Used as a pshufb control: the low 8 result bytes take the even source bytes
// in reverse order (14, 12, ..., 0) and the high 8 result bytes take the odd
// source bytes in reverse order (15, 13, ..., 1), so one shuffle both
// de-interleaves and mirrors 8 byte pairs.
3366851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleMirrorUV = {
336716a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
336816a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com};
336916a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com
// Mirrors a row of interleaved byte pairs and splits it into two planes,
// 8 pairs (16 source bytes) per loop iteration. The even source bytes go to
// dst_u and the odd source bytes go to dst_v, both in reversed order.
//   src   - interleaved source row; read from its end backwards with movdqa,
//           so src + width * 2 is expected to be 16-byte aligned.
//   dst_u - receives 8 bytes per iteration (low half of the shuffle result).
//   dst_v - receives 8 bytes per iteration (high half of the shuffle result).
//   width - number of byte pairs. The loop repeats while the remaining count
//           is positive.
// Naked function: arguments are fetched from the stack by hand and edi is
// saved and restored here.
3370d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
3371bdf7cb591452611090922e690d5104a7d8c6b1e5fbarchard@google.comvoid MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
337216a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com                       int width) {
337316a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com  __asm {
337416a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    push      edi
    // Stack offsets: the first 4 skips the register pushed above; the first
    // argument sits just past the 4-byte return address.
337516a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    mov       eax, [esp + 4 + 4]   // src
337616a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    mov       edx, [esp + 4 + 8]   // dst_u
337716a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    mov       edi, [esp + 4 + 12]  // dst_v
337816a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    mov       ecx, [esp + 4 + 16]  // width
337916a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    movdqa    xmm1, kShuffleMirrorUV
    // Point eax at the last 16 bytes of the source row (2 bytes per pair).
338016a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    lea       eax, [eax + ecx * 2 - 16]
    // edi becomes dst_v - dst_u so [edx + edi] tracks dst_v as edx advances.
338116a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    sub       edi, edx
338216a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com
3383c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
338416a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com convertloop:
338516a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    movdqa    xmm0, [eax]
338616a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    lea       eax, [eax - 16]
338716a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    pshufb    xmm0, xmm1
    // The stores and lea that follow do not modify the flags, so the jg at
    // the bottom of the loop tests the result of this sub.
338816a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    sub       ecx, 8
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v.
338916a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    movlpd    qword ptr [edx], xmm0
339016a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    movhpd    qword ptr [edx + edi], xmm0
339116a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    lea       edx, [edx + 8]
339218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg        convertloop
339316a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com
339416a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    pop       edi
339516a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com    ret
339616a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com  }
339716a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com}
33984c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com#endif  // HAS_MIRRORROW_UV_SSSE3
339916a96645b4987fddbcf726dea2fcf5dc87ca10e1fbarchard@google.com
340027d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com#ifdef HAS_ARGBMIRRORROW_SSSE3
340127d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com// Shuffle table for reversing the bytes.
// Used as the pshufb control mask in ARGBMirrorRow_SSSE3: the four 4-byte
// groups of a 16-byte vector are emitted in reverse order, while the byte
// order inside each group is kept (12..15, 8..11, 4..7, 0..3).
3402851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kARGBShuffleMirror = {
340327d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
340427d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com};
340527d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com
// Mirrors a row of 4-byte pixels: pixel order is reversed, bytes inside each
// pixel keep their order.
//   src   - source row (4 * width bytes); read with movdqa (16-byte aligned).
//   dst   - destination row; written with movdqa (16-byte aligned).
//   width - number of pixels; 4 pixels are processed per loop iteration and
//           the loop body always runs at least once.
340627d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com__declspec(naked) __declspec(align(16))
340727d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.comvoid ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3408f5f6fd2aa778c379c442a2210ddd8d2ee03e8eb6fbarchard@google.com  __asm {
340927d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    mov       eax, [esp + 4]   // src
341027d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    mov       edx, [esp + 8]   // dst
341127d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    mov       ecx, [esp + 12]  // width
34129335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
341327d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    movdqa    xmm5, kARGBShuffleMirror  // pshufb mask: reverse 4-byte groups
341427d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com
3415c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
341627d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com convertloop:
34179335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    movdqa    xmm0, [eax]  // aligned load of 4 pixels
34189335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    lea       eax, [eax - 16]  // walk the source backwards
341927d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    pshufb    xmm0, xmm5  // reverse the order of the 4 pixels
342027d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    sub       ecx, 4  // sets the flags tested by jg below
342127d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    movdqa    [edx], xmm0  // aligned store
342227d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    lea       edx, [edx + 16]  // lea leaves the flags from sub intact
342327d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    jg        convertloop  // repeat while remaining width > 0
342427d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com    ret
342527d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com  }
342627d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com}
342727d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com#endif  // HAS_ARGBMIRRORROW_SSSE3
342827d42c7ff6452c53643bc57ee8b7b17afbe8dfd0fbarchard@google.com
342951398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com#ifdef HAS_ARGBMIRRORROW_AVX2
343051398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com// Shuffle table for reversing the bytes.
// Used as the vpermd index vector in ARGBMirrorRow_AVX2: each entry is a
// 32-bit element index, so 7..0 reverses the eight dwords of a 256-bit vector.
3431851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const ulvec32 kARGBShuffleMirror_AVX2 = {
343251398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
343351398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com};
343451398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com
// Mirrors a row of 4-byte pixels using AVX2: pixel order is reversed, bytes
// inside each pixel keep their order.
//   src   - source row (4 * width bytes), read as a vpermd memory operand.
//   dst   - destination row; written with vmovdqu (unaligned store).
//   width - number of pixels; 8 pixels are processed per loop iteration and
//           the loop body always runs at least once.
343551398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com__declspec(naked) __declspec(align(16))
343651398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.comvoid ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
343751398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com  __asm {
343851398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    mov       eax, [esp + 4]   // src
343951398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    mov       edx, [esp + 8]   // dst
344051398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    mov       ecx, [esp + 12]  // width
344151398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    lea       eax, [eax - 32]  // bias so [eax + ecx * 4] is the last 8 pixels left
344251398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    vmovdqa   ymm5, kARGBShuffleMirror_AVX2  // dword indices 7..0
344351398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com
3444c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
344551398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com convertloop:
344651398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
344751398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    sub       ecx, 8  // moves the read window back 8 pixels; sets flags for jg
344851398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    vmovdqu   [edx], ymm0  // store 8 mirrored pixels
344951398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    lea       edx, [edx + 32]  // lea leaves the flags from sub intact
345051398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    jg        convertloop  // repeat while remaining width > 0
34519b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper  // clear upper ymm halves before returning
345251398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com    ret
345351398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com  }
345451398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com}
345551398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com#endif  // HAS_ARGBMIRRORROW_AVX2
345651398e0be5004b8818df4a4ccda9fe77bcfaf141fbarchard@google.com
3457f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com#ifdef HAS_SPLITUVROW_SSE2
// De-interleaves a row of byte pairs: even source bytes go to dst_u, odd
// source bytes go to dst_v.
//   src_uv - interleaved source row (2 * pix bytes); read with movdqa.
//   dst_u  - destination for even bytes; written with movdqa.
//   dst_v  - destination for odd bytes; written with movdqa.
//   pix    - number of byte pairs; 16 pairs are processed per loop iteration
//            and the loop body always runs at least once.
// All three pointers are accessed with movdqa and so must be 16-byte aligned.
3458d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
3459f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.comvoid SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
34602d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com  __asm {
34612d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    push       edi                   // saved here, restored before ret
34622d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uv
34632d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
34642d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
34652d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
34662d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
34672d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    psrlw      xmm5, 8
34682d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    sub        edi, edx  // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
34692d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com
3470c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
34712d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com  convertloop:
34722d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa     xmm0, [eax]  // pairs 0..7
34732d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa     xmm1, [eax + 16]  // pairs 8..15
34742d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    lea        eax,  [eax + 32]
34752d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa     xmm2, xmm0  // keep copies for the odd-byte extraction
34762d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa     xmm3, xmm1
34772d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    pand       xmm0, xmm5   // even bytes
34782d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    pand       xmm1, xmm5
34792d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    packuswb   xmm0, xmm1   // 16 words (each <= 0xff) -> 16 bytes
34802d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    psrlw      xmm2, 8      // odd bytes
34812d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    psrlw      xmm3, 8
34822d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    packuswb   xmm2, xmm3
34832d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa     [edx], xmm0  // 16 even bytes -> dst_u
34842d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    movdqa     [edx + edi], xmm2  // 16 odd bytes -> dst_v
34852d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    lea        edx, [edx + 16]
34862d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    sub        ecx, 16
348718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while remaining pix > 0
348818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com
34892d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    pop        edi
34902d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com    ret
34912d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com  }
34922d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com}
3493db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com
// De-interleaves a row of byte pairs: even source bytes go to dst_u, odd
// source bytes go to dst_v. Same algorithm as SplitUVRow_SSE2, but every
// memory access uses movdqu, so no pointer alignment is required.
//   src_uv - interleaved source row (2 * pix bytes).
//   dst_u  - destination for even bytes.
//   dst_v  - destination for odd bytes.
//   pix    - number of byte pairs; 16 pairs are processed per loop iteration
//            and the loop body always runs at least once.
3494db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com__declspec(naked) __declspec(align(16))
3495f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.comvoid SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3496f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com                               int pix) {
3497db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com  __asm {
3498db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    push       edi                   // saved here, restored before ret
3499db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uv
3500db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
3501db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
3502db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
3503db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
3504db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    psrlw      xmm5, 8
3505db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    sub        edi, edx  // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
3506db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com
3507c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3508db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com  convertloop:
3509db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    movdqu     xmm0, [eax]  // pairs 0..7
3510db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    movdqu     xmm1, [eax + 16]  // pairs 8..15
3511db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    lea        eax,  [eax + 32]
3512db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    movdqa     xmm2, xmm0  // register copies for the odd-byte extraction
3513db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    movdqa     xmm3, xmm1
3514db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    pand       xmm0, xmm5   // even bytes
3515db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    pand       xmm1, xmm5
3516db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    packuswb   xmm0, xmm1   // 16 words (each <= 0xff) -> 16 bytes
3517db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    psrlw      xmm2, 8      // odd bytes
3518db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    psrlw      xmm3, 8
3519db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    packuswb   xmm2, xmm3
3520db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    movdqu     [edx], xmm0  // 16 even bytes -> dst_u
3521db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    movdqu     [edx + edi], xmm2  // 16 odd bytes -> dst_v
3522db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    lea        edx, [edx + 16]
3523db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    sub        ecx, 16
3524db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    jg         convertloop  // repeat while remaining pix > 0
3525db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com
3526db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    pop        edi
3527db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com    ret
3528db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com  }
3529db694edfc2dcdede9adad7febc4e4b7f9506eee8fbarchard@google.com}
3530f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com#endif  // HAS_SPLITUVROW_SSE2
35312d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com
3532c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com#ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a row of byte pairs using AVX2: even source bytes go to
// dst_u, odd source bytes go to dst_v. All loads and stores use vmovdqu.
//   src_uv - interleaved source row (2 * pix bytes).
//   dst_u  - destination for even bytes.
//   dst_v  - destination for odd bytes.
//   pix    - number of byte pairs; 32 pairs are processed per loop iteration
//            and the loop body always runs at least once.
3533c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com__declspec(naked) __declspec(align(16))
3534c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.comvoid SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3535c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com  __asm {
3536c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    push       edi                   // saved here, restored before ret
3537c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uv
3538c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
3539c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
3540c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
3541c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3542c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpsrlw     ymm5, ymm5, 8
3543c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    sub        edi, edx  // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
3544c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com
3545c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3546c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com  convertloop:
3547b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]  // pairs 0..15
3548b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]  // pairs 16..31
3549c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    lea        eax,  [eax + 64]
3550c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpsrlw     ymm2, ymm0, 8      // odd bytes
3551c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpsrlw     ymm3, ymm1, 8
3552c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpand      ymm0, ymm0, ymm5   // even bytes
3553c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpand      ymm1, ymm1, ymm5
3554c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // packs per 128-bit lane: qwords come out as 0,2,1,3
3555c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpackuswb  ymm2, ymm2, ymm3
3556c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpermq     ymm0, ymm0, 0xd8   // 0xd8 selects qwords 0,2,1,3 -> restores linear order
3557c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    vpermq     ymm2, ymm2, 0xd8
3558b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edx], ymm0  // 32 even bytes -> dst_u
3559b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edx + edi], ymm2  // 32 odd bytes -> dst_v
3560c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    lea        edx, [edx + 32]
3561c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    sub        ecx, 32
3562c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    jg         convertloop  // repeat while remaining pix > 0
3563c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com
3564c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    pop        edi
35659b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper  // clear upper ymm halves before returning
3566c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    ret
3567c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com  }
3568c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com}
3569b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#endif  // HAS_SPLITUVROW_AVX2
3570c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com
3571b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#ifdef HAS_MERGEUVROW_SSE2
// Interleaves two byte rows into one: dst_uv receives src_u[0], src_v[0],
// src_u[1], src_v[1], ...
//   src_u  - first source row (width bytes); read with movdqa.
//   src_v  - second source row (width bytes); read with movdqa.
//   dst_uv - interleaved destination (2 * width bytes); written with movdqa.
//   width  - number of bytes per source row; 16 are processed per loop
//            iteration and the loop body always runs at least once.
// All three pointers are accessed with movdqa and so must be 16-byte aligned.
3572c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com__declspec(naked) __declspec(align(16))
3573b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3574b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                     int width) {
3575c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com  __asm {
3576c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    push       edi                   // saved here, restored before ret
3577b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_u
3578b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 4 + 8]    // src_v
3579b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_uv
3580b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]   // width
3581b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        edx, eax  // edx = src_v - src_u, so [eax + edx] addresses src_v
3582c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com
3583c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3584c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com  convertloop:
3585b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    movdqa     xmm0, [eax]      // read 16 U's
3586b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    movdqa     xmm1, [eax + edx]  // and 16 V's
3587b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 16]
3588b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    movdqa     xmm2, xmm0       // copy of the U's for the high-half unpack
3589b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    punpcklbw  xmm0, xmm1       // first 8 UV pairs
3590b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    punpckhbw  xmm2, xmm1       // next 8 UV pairs
3591b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    movdqa     [edi], xmm0
3592b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    movdqa     [edi + 16], xmm2
3593b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edi, [edi + 32]
3594b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 16
3595c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    jg         convertloop  // repeat while remaining width > 0
3596c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com
3597c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    pop        edi
3598c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com    ret
3599c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com  }
3600c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com}
3601c9562334d7ee6b7ecda388001f47864d5fd6ca0afbarchard@google.com
// Interleaves two byte rows into one: dst_uv receives src_u[0], src_v[0],
// src_u[1], src_v[1], ... Same algorithm as MergeUVRow_SSE2, but every memory
// access uses movdqu, so no pointer alignment is required.
//   src_u  - first source row (width bytes).
//   src_v  - second source row (width bytes).
//   dst_uv - interleaved destination (2 * width bytes).
//   width  - number of bytes per source row; 16 are processed per loop
//            iteration and the loop body always runs at least once.
3602e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com__declspec(naked) __declspec(align(16))
3603f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.comvoid MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
3604f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com                               uint8* dst_uv, int width) {
3605e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com  __asm {
3606e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    push       edi                   // saved here, restored before ret
3607e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_u
3608e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    mov        edx, [esp + 4 + 8]    // src_v
3609e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_uv
3610e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // width
3611e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    sub        edx, eax  // edx = src_v - src_u, so [eax + edx] addresses src_v
3612e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com
3613c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3614e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com  convertloop:
3615e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    movdqu     xmm0, [eax]      // read 16 U's
3616e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    movdqu     xmm1, [eax + edx]  // and 16 V's
3617e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    lea        eax,  [eax + 16]
3618e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    movdqa     xmm2, xmm0       // copy of the U's for the high-half unpack
3619e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    punpcklbw  xmm0, xmm1       // first 8 UV pairs
3620e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    punpckhbw  xmm2, xmm1       // next 8 UV pairs
3621e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    movdqu     [edi], xmm0
3622e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    movdqu     [edi + 16], xmm2
3623e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    lea        edi, [edi + 32]
3624e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    sub        ecx, 16
3625e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    jg         convertloop  // repeat while remaining width > 0
3626e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com
3627e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    pop        edi
3628e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com    ret
3629e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com  }
3630e0d8648b6ab861cfcf03513439fad8ae39ba50c2fbarchard@google.com}
3631f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com#endif  //  HAS_MERGEUVROW_SSE2
36321dafd444ba355e8188cc42c61d3ad85d6681fd1dfbarchard@google.com
3633b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#ifdef HAS_MERGEUVROW_AVX2
// Interleaves two byte rows into one using AVX2: dst_uv receives src_u[0],
// src_v[0], src_u[1], src_v[1], ... All loads and stores use vmovdqu.
//   src_u  - first source row (width bytes).
//   src_v  - second source row (width bytes).
//   dst_uv - interleaved destination (2 * width bytes).
//   width  - number of bytes per source row; 32 are processed per loop
//            iteration and the loop body always runs at least once.
3634b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
3635b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3636b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                     int width) {
3637b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
3638b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       edi                   // saved here, restored before ret
3639b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_u
3640b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 4 + 8]    // src_v
3641b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_uv
3642b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]   // width
3643b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        edx, eax  // edx = src_v - src_u, so [eax + edx] addresses src_v
3644b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3645c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3646b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
3647b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]           // read 32 U's
3648b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + edx]     // and 32 V's
3649b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 32]
    // The unpacks work per 128-bit lane, so ymm2 ends up holding pairs 0..7 and
    // 16..23 while ymm0 holds pairs 8..15 and 24..31; the two vperm2i128
    // instructions below regroup the lanes into linear order.
3650b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
3651b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
3652b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
3653b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
3654b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edi], ymm1  // pairs 0..15
3655b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edi + 32], ymm2  // pairs 16..31
3656b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edi, [edi + 64]
3657b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
3658b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop  // repeat while remaining width > 0
3659b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3660b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        edi
36619b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper  // clear upper ymm halves before returning
3662b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
3663b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
3664b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
3665b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#endif  //  HAS_MERGEUVROW_AVX2
3666b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
366719932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com#ifdef HAS_COPYROW_SSE2
3668c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
// Both src and dst are accessed with movdqa and so must be 16-byte aligned.
// The loop body always runs at least once and moves a full 32 bytes per
// iteration, so the number of bytes copied is count rounded up to a multiple
// of 32 (and 32 when count <= 0).
3669d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
367019932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.comvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
367119932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com  __asm {
367219932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        eax, [esp + 4]   // src
367319932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        edx, [esp + 8]   // dst
367419932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        ecx, [esp + 12]  // count
3675ba3aeed3b86dfae7bc0631c8bed9b50303318dcafbarchard@google.com
3676c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
367719932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com  convertloop:
367819932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    movdqa     xmm0, [eax]  // load 32 bytes
367919932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    movdqa     xmm1, [eax + 16]
368019932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    lea        eax, [eax + 32]
3681c140b9d150bb40ff79a2a53ad560494b67fb115ffbarchard@google.com    movdqa     [edx], xmm0  // store 32 bytes
3682c140b9d150bb40ff79a2a53ad560494b67fb115ffbarchard@google.com    movdqa     [edx + 16], xmm1
3683c140b9d150bb40ff79a2a53ad560494b67fb115ffbarchard@google.com    lea        edx, [edx + 32]
368419932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    sub        ecx, 32
368518184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop  // repeat while remaining count > 0
368619932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    ret
368719932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com  }
368819932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com}
368919932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com#endif  // HAS_COPYROW_SSE2
369019932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com
3691b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com// Unaligned Multiple of 1.
// Copies 'count' bytes from src to dst with a single 'rep movsb', so there is
// no alignment requirement and count may be any byte count.
// esi/edi are parked in eax/edx instead of being pushed, which keeps the
// argument offsets at [esp + 4..12]; both are restored before returning.
3692b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
3693aa7988ff733b13d7bfd3c755bf0c18f93b9e8f6efbarchard@google.comvoid CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3694b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
3695b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, esi         // save esi
3696b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, edi         // save edi
3697b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        esi, [esp + 4]   // src
3698b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 8]   // dst
3699b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 12]  // count
3700b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    rep movsb                   // copy ecx bytes from [esi] to [edi]
3701b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, edx         // restore edi
3702b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        esi, eax         // restore esi
3703b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
3704b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
3705b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
3706b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
370719932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com#ifdef HAS_COPYROW_X86
// Copies count / 4 dwords from src to dst with 'rep movsd'. The byte count is
// truncated by the shift, so any trailing count % 4 bytes are not copied.
// esi/edi are parked in eax/edx instead of being pushed, which keeps the
// argument offsets at [esp + 4..12]; both are restored before returning.
3708d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
370919932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.comvoid CopyRow_X86(const uint8* src, uint8* dst, int count) {
371019932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com  __asm {
371119932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        eax, esi         // save esi
371219932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        edx, edi         // save edi
371319932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        esi, [esp + 4]   // src
371419932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        edi, [esp + 8]   // dst
371519932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        ecx, [esp + 12]  // count
371619932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    shr        ecx, 2           // bytes -> dwords
371719932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    rep movsd                   // copy ecx dwords from [esi] to [edi]
371819932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        edi, edx         // restore edi
371919932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    mov        esi, eax         // restore esi
372019932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com    ret
372119932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com  }
372219932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com}
37234c416e8849dcda3e884c6204bcdaa264f66d4288fbarchard@google.com#endif  // HAS_COPYROW_X86
372419932f8dbc5ca3123d87b5b8369e7d7bf3469a97fbarchard@google.com
37257f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com#ifdef HAS_ARGBCOPYALPHAROW_SSE2
37267f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com// width in pixels
37277f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com__declspec(naked) __declspec(align(16))
37287f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.comvoid ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
37297f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com  __asm {
37307f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com    mov        eax, [esp + 4]   // src
3731f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    mov        edx, [esp + 8]   // dst
37327f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com    mov        ecx, [esp + 12]  // count
3733f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3734f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pslld      xmm0, 24
3735f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3736f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    psrld      xmm1, 8
37377f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com
3738f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    align      4
37397f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com  convertloop:
3740f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    movdqa     xmm2, [eax]
3741f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    movdqa     xmm3, [eax + 16]
37427f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com    lea        eax, [eax + 32]
3743f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    movdqa     xmm4, [edx]
3744f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    movdqa     xmm5, [edx + 16]
3745f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pand       xmm2, xmm0
3746f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pand       xmm3, xmm0
3747f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pand       xmm4, xmm1
3748f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    pand       xmm5, xmm1
3749f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    por        xmm2, xmm4
3750f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    por        xmm3, xmm5
3751f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    movdqa     [edx], xmm2
3752f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    movdqa     [edx + 16], xmm3
3753f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    lea        edx, [edx + 32]
37547f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com    sub        ecx, 8
37557f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com    jg         convertloop
37567f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com
37577f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com    ret
37587f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com  }
37597f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com}
37607f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com#endif  // HAS_ARGBCOPYALPHAROW_SSE2
37617f67961ec53f0ad12f827905fc4a4cc880f00931fbarchard@google.com
3762f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com#ifdef HAS_ARGBCOPYALPHAROW_AVX2
3763f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com// width in pixels
3764f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com__declspec(naked) __declspec(align(16))
3765f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.comvoid ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3766f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com  __asm {
3767f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    mov        eax, [esp + 4]   // src
3768f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    mov        edx, [esp + 8]   // dst
3769f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    mov        ecx, [esp + 12]  // count
37703075de82856a044ebd3e808b2f0918d2b0e9713cfbarchard@google.com    vpcmpeqb   ymm0, ymm0, ymm0
3771adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3772f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com
3773f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    align      4
3774f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com  convertloop:
3775adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vmovdqu    ymm1, [eax]
3776adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vmovdqu    ymm2, [eax + 32]
3777f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    lea        eax, [eax + 64]
3778adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpblendvb  ymm1, ymm1, [edx], ymm0
3779adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3780adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vmovdqu    [edx], ymm1
3781adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vmovdqu    [edx + 32], ymm2
3782f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    lea        edx, [edx + 64]
3783f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    sub        ecx, 16
3784f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    jg         convertloop
3785f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com
3786f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    vzeroupper
3787f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com    ret
3788f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com  }
3789f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com}
3790f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com#endif  // HAS_ARGBCOPYALPHAROW_AVX2
3791f6631bb814600f841f74a9d8a626b528be2fd8bbfbarchard@google.com
3792adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3793adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com// width in pixels
3794adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com__declspec(naked) __declspec(align(16))
3795adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.comvoid ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3796adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com  __asm {
3797adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    mov        eax, [esp + 4]   // src
3798adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    mov        edx, [esp + 8]   // dst
3799adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    mov        ecx, [esp + 12]  // count
3800adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
3801adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pslld      xmm0, 24
3802adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
3803adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    psrld      xmm1, 8
3804adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com
3805adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    align      4
3806adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com  convertloop:
3807adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    movq       xmm2, qword ptr [eax]  // 8 Y's
3808adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    lea        eax, [eax + 8]
3809adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    punpcklbw  xmm2, xmm2
3810adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    punpckhwd  xmm3, xmm2
3811adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    punpcklwd  xmm2, xmm2
3812adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    movdqa     xmm4, [edx]
3813adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    movdqa     xmm5, [edx + 16]
3814adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pand       xmm2, xmm0
3815adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pand       xmm3, xmm0
3816adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pand       xmm4, xmm1
3817adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    pand       xmm5, xmm1
3818adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    por        xmm2, xmm4
3819adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    por        xmm3, xmm5
3820adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    movdqa     [edx], xmm2
3821adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    movdqa     [edx + 16], xmm3
3822adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    lea        edx, [edx + 32]
3823adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    sub        ecx, 8
3824adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    jg         convertloop
3825adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com
3826adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    ret
3827adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com  }
3828adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com}
3829adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3830adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com
3831adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3832adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com// width in pixels
3833adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com__declspec(naked) __declspec(align(16))
3834adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.comvoid ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3835adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com  __asm {
3836adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    mov        eax, [esp + 4]   // src
3837adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    mov        edx, [esp + 8]   // dst
3838adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    mov        ecx, [esp + 12]  // count
3839adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpcmpeqb   ymm0, ymm0, ymm0
3840adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
3841adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com
3842adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    align      4
3843adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com  convertloop:
3844adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpmovzxbd  ymm1, qword ptr [eax]
3845adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpmovzxbd  ymm2, qword ptr [eax + 8]
3846adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    lea        eax, [eax + 16]
3847adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpslld     ymm1, ymm1, 24
3848adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpslld     ymm2, ymm2, 24
3849adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpblendvb  ymm1, ymm1, [edx], ymm0
3850adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
3851adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vmovdqu    [edx], ymm1
3852adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vmovdqu    [edx + 32], ymm2
3853adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    lea        edx, [edx + 64]
3854adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    sub        ecx, 16
3855adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    jg         convertloop
3856adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com
3857adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    vzeroupper
3858adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com    ret
3859adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com  }
3860adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com}
3861adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3862adef267edfb3539cd773692d6fa4050ffd092f55fbarchard@google.com
386364ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com#ifdef HAS_SETROW_X86
386464ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com// SetRow8 writes 'count' bytes using a 32 bit value repeated.
386564ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com__declspec(naked) __declspec(align(16))
3866f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.comvoid SetRow_X86(uint8* dst, uint32 v32, int count) {
386764ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com  __asm {
386864ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        edx, edi
386964ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        edi, [esp + 4]   // dst
387064ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        eax, [esp + 8]   // v32
387164ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        ecx, [esp + 12]  // count
387264ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    shr        ecx, 2
387364ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    rep stosd
387464ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        edi, edx
387564ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    ret
387664ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com  }
387764ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com}
387864ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com
387964ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com// SetRow32 writes 'count' words using a 32 bit value repeated.
388064ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com__declspec(naked) __declspec(align(16))
3881f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.comvoid ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
388264ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com                   int dst_stride, int height) {
388364ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com  __asm {
388464ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    push       esi
388564ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    push       edi
388664ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    push       ebp
388764ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        edi, [esp + 12 + 4]   // dst
388864ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        eax, [esp + 12 + 8]   // v32
388964ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        ebp, [esp + 12 + 12]  // width
389064ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        edx, [esp + 12 + 16]  // dst_stride
389164ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        esi, [esp + 12 + 20]  // height
389264ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    lea        ecx, [ebp * 4]
389364ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    sub        edx, ecx             // stride - width * 4
389464ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com
3895c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
389664ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com  convertloop:
389764ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    mov        ecx, ebp
389864ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    rep stosd
389964ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    add        edi, edx
390064ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    sub        esi, 1
390164ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    jg         convertloop
390264ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com
390364ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    pop        ebp
390464ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    pop        edi
390564ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    pop        esi
390664ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com    ret
390764ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com  }
390864ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com}
390964ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com#endif  // HAS_SETROW_X86
391064ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com
3911b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#ifdef HAS_YUY2TOYROW_AVX2
3912b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
3913b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid YUY2ToYRow_AVX2(const uint8* src_yuy2,
3914b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                     uint8* dst_y, int pix) {
3915b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
3916b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 4]    // src_yuy2
3917b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 8]    // dst_y
3918b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 12]   // pix
3919b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
3920b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm5, ymm5, 8
3921b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3922c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3923b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
3924b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]
3925b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
3926b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 64]
3927b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm0, ymm0, ymm5   // even bytes are Y
3928b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm1, ymm5
3929b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3930b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
3931b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
3932b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edx], ymm0
3933b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edx, [edx + 32]
3934b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop
39359b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
3936b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
3937b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
3938b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
3939b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3940b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
3941b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3942b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                      uint8* dst_u, uint8* dst_v, int pix) {
3943b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
3944b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       esi
3945b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       edi
3946b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_yuy2
3947b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        esi, [esp + 8 + 8]    // stride_yuy2
3948b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_u
3949b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_v
3950b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 8 + 20]   // pix
3951b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3952b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm5, ymm5, 8
3953b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        edi, edx
3954b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3955c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3956b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
3957b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]
3958b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
3959b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpavgb     ymm0, ymm0, [eax + esi]
3960b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpavgb     ymm1, ymm1, [eax + esi + 32]
3961b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 64]
3962b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
3963b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm1, ymm1, 8
3964b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // mutates.
3965b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
3966b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm0, ymm5  // U
3967b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8     // V
3968b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm1, ymm1, ymm1  // mutates.
3969b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm0  // mutates.
3970b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm1, ymm1, 0xd8
3971b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
3972b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx], ymm1, 0  // U
3973b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx + edi], ymm0, 0 // V
3974b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edx, [edx + 16]
3975b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
3976b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop
3977b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3978b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        edi
3979b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        esi
39809b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
3981b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
3982b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
3983b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
3984b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3985b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
3986b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3987b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                         uint8* dst_u, uint8* dst_v, int pix) {
3988b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
3989b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       edi
3990b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_yuy2
3991b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
3992b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
3993b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
3994b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
3995b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm5, ymm5, 8
3996b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        edi, edx
3997b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
3998c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
3999b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
4000b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]
4001b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
4002b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 64]
4003b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
4004b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm1, ymm1, 8
4005b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // mutates.
4006b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
4007b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm0, ymm5  // U
4008b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8     // V
4009b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm1, ymm1, ymm1  // mutates.
4010b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm0  // mutates.
4011b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm1, ymm1, 0xd8
4012b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
4013b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx], ymm1, 0  // U
4014b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx + edi], ymm0, 0 // V
4015b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edx, [edx + 16]
4016b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
4017b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop
4018b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4019b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        edi
40209b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
4021b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
4022b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
4023b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
4024b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4025b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
4026b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid UYVYToYRow_AVX2(const uint8* src_uyvy,
4027b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                     uint8* dst_y, int pix) {
4028b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
4029b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 4]    // src_uyvy
4030b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 8]    // dst_y
4031b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 12]   // pix
4032b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4033c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4034b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
4035b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]
4036b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
4037b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 64]
4038b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
4039b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm1, ymm1, 8
4040b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // mutates.
4041b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
4042b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
4043b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    [edx], ymm0
4044b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edx, [edx + 32]
4045b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop
4046b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
40479b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
4048b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
4049b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
4050b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4051b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
4052b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
4053b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                      uint8* dst_u, uint8* dst_v, int pix) {
4054b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
4055b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       esi
4056b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       edi
4057b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_yuy2
4058b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        esi, [esp + 8 + 8]    // stride_yuy2
4059b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_u
4060b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_v
4061b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 8 + 20]   // pix
4062b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
4063b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm5, ymm5, 8
4064b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        edi, edx
4065b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4066c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4067b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
4068b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]
4069b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
4070b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpavgb     ymm0, ymm0, [eax + esi]
4071b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpavgb     ymm1, ymm1, [eax + esi + 32]
4072b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 64]
4073b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
4074b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm1, ymm5
4075b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // mutates.
4076b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
4077b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm0, ymm5  // U
4078b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8     // V
4079b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm1, ymm1, ymm1  // mutates.
4080b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm0  // mutates.
4081b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm1, ymm1, 0xd8
4082b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8
4083b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx], ymm1, 0  // U
4084b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx + edi], ymm0, 0 // V
4085b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edx, [edx + 16]
4086b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
4087b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop
4088b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4089b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        edi
4090b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        esi
40919b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
4092b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
4093b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
4094b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
4095b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
// Splits the chroma of one row of packed UYVY into separate U and V rows
// using AVX2. Only the row at src_uyvy is read (no second row is averaged).
// Each loop pass consumes 64 source bytes, writes 16 bytes to dst_u and
// 16 bytes to dst_v, and subtracts 32 from pix. The count is tested after
// the body, so at least one pass always runs and pix is effectively rounded
// up to a multiple of 32.
4096b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com__declspec(naked) __declspec(align(16))
4097b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.comvoid UYVYToUV422Row_AVX2(const uint8* src_uyvy,
4098b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com                         uint8* dst_u, uint8* dst_v, int pix) {
4099b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  __asm {
4100b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    push       edi
4101b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uyvy (first +4 skips the pushed edi)
4102b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
4103b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
4104b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
4105b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
4106b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm5, ymm5, 8
4107b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
4108b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4109c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4110b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  convertloop:
4111b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm0, [eax]           // unaligned loads of 64 source bytes
4112b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
4113b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        eax,  [eax + 64]
4114b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV (keep the even bytes)
4115b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm1, ymm5
4116b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1   // mutates: packs within each 128-bit lane.
4117b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8   // reorder qwords 0,2,1,3 to undo the lane interleave
4118b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpand      ymm1, ymm0, ymm5  // U
4119b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpsrlw     ymm0, ymm0, 8     // V
4120b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm1, ymm1, ymm1  // mutates: packs within each 128-bit lane.
4121b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpackuswb  ymm0, ymm0, ymm0  // mutates: packs within each 128-bit lane.
4122b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm1, ymm1, 0xd8  // gather the 16 packed U bytes into the low 128 bits
4123b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vpermq     ymm0, ymm0, 0xd8  // gather the 16 packed V bytes into the low 128 bits
4124b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx], ymm1, 0  // U (store low 16 bytes)
4125b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    vextractf128 [edx + edi], ymm0, 0 // V (store low 16 bytes)
4126b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    lea        edx, [edx + 16]
4127b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    sub        ecx, 32
4128b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    jg         convertloop
4129b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4130b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    pop        edi
41319b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper                       // clear upper YMM halves before returning to non-AVX code
4132b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com    ret
4133b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com  }
4134b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com}
4135b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com#endif  // HAS_YUY2TOYROW_AVX2
4136b444bae883e97c1e4579f2e1148cf14f9c7c18fbfbarchard@google.com
4137b95dbf24951d8b7118f680d75c7456a5f5d57bfffbarchard@google.com#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y bytes from one row of packed YUY2 using SSE2.
// Each loop pass reads 32 source bytes and stores 16 bytes to dst_y, both
// with movdqa (the aligned move form), and subtracts 16 from pix. The count
// is tested after the body, so at least one pass always runs and pix is
// effectively rounded up to a multiple of 16.
4138d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4139e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid YUY2ToYRow_SSE2(const uint8* src_yuy2,
4140e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                     uint8* dst_y, int pix) {
4141e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4142e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 4]    // src_yuy2
4143e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8]    // dst_y
4144e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 12]   // pix
4145e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
4146e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm5, 8
4147e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4148c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4149e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4150e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm0, [eax]
4151e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, [eax + 16]
4152e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4153e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5   // even bytes are Y
4154e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm1, xmm5
4155e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1   // 16 words -> 16 Y bytes
415618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16      // sets the flags tested by jg; the store and lea below do not change them
4157e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     [edx], xmm0
4158e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 16]
415918184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4160e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4161e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4162e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4163e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Produces one U row and one V row from two rows of packed YUY2 using SSE2.
// The row at src_yuy2 and the row stride_yuy2 bytes after it are averaged
// bytewise with pavgb, then the chroma bytes are split into dst_u and dst_v.
// Each loop pass reads 32 bytes from each row with movdqa (the aligned move
// form), writes 8 bytes to each output with movq, and subtracts 16 from pix.
// The count is tested after the body, so at least one pass always runs.
4164d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4165e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
4166c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                      uint8* dst_u, uint8* dst_v, int pix) {
4167e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4168e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       esi
4169e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       edi
4170e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_yuy2 (first +8 skips the pushed esi and edi)
4171e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        esi, [esp + 8 + 8]    // stride_yuy2
4172e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_u
4173e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_v
4174e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 8 + 20]   // pix
4175e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4176e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm5, 8
4177e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
4178e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4179c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4180e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4181e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm0, [eax]
4182e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, [eax + 16]
4183e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm2, [eax + esi]     // same columns, row at src + stride
4184e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm3, [eax + esi + 16]
4185e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4186e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm0, xmm2            // bytewise average of the two rows
4187e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm1, xmm3
4188e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm0, 8      // YUYV -> UVUV (keep the odd bytes)
4189e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8
4190e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1
4191e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, xmm0   // copy so U and V can be separated independently
4192e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5  // U
4193e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm0
4194e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8     // V
4195e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm1, xmm1
4196e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx], xmm0        // store 8 U bytes
4197e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx + edi], xmm1  // store 8 V bytes
4198e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 8]
4199e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        ecx, 16
420018184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4201e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4202e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        edi
4203e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        esi
4204e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4205e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4206e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4207e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Splits the chroma of one row of packed YUY2 into separate U and V rows
// using SSE2. Only the row at src_yuy2 is read (no second row is averaged).
// Each loop pass reads 32 source bytes with movdqa (the aligned move form),
// writes 8 bytes to each output with movq, and subtracts 16 from pix. The
// count is tested after the body, so at least one pass always runs.
4208d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4209c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.comvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
4210c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                         uint8* dst_u, uint8* dst_v, int pix) {
4211c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  __asm {
4212c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    push       edi
4213c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_yuy2 (first +4 skips the pushed edi)
4214c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
4215c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
4216c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
4217c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4218c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm5, 8
4219c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
4220c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4221c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4222c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  convertloop:
4223c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm0, [eax]
4224c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm1, [eax + 16]
4225c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        eax,  [eax + 32]
4226c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm0, 8      // YUYV -> UVUV (keep the odd bytes)
4227c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm1, 8
4228c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm1
4229c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm1, xmm0   // copy so U and V can be separated independently
4230c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm0, xmm5  // U
4231c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm0
4232c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm1, 8     // V
4233c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm1, xmm1
4234c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx], xmm0        // store 8 U bytes
4235c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx + edi], xmm1  // store 8 V bytes
4236c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        edx, [edx + 8]
4237c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        ecx, 16
4238c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    jg         convertloop
4239c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4240c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pop        edi
4241c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    ret
4242c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  }
4243c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com}
4244c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
// Extracts the Y bytes from one row of packed YUY2 using SSE2, with movdqu
// (the unaligned move form) for every load and store.
// Each loop pass reads 32 source bytes, stores 16 bytes to dst_y, and
// subtracts 16 from pix. The count is tested after the body, so at least one
// pass always runs and pix is effectively rounded up to a multiple of 16.
4245c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com__declspec(naked) __declspec(align(16))
4246e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
4247e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                               uint8* dst_y, int pix) {
4248e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4249e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 4]    // src_yuy2
4250e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8]    // dst_y
4251e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 12]   // pix
4252e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
4253e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm5, 8
4254e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4255c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4256e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4257e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm0, [eax]
4258e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm1, [eax + 16]
4259e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4260e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5   // even bytes are Y
4261e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm1, xmm5
4262e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1   // 16 words -> 16 Y bytes
426318184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16      // sets the flags tested by jg; the store and lea below do not change them
4264e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     [edx], xmm0
4265e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 16]
426618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4267e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4268e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4269e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4270e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Produces one U row and one V row from two rows of packed YUY2 using SSE2,
// with movdqu (the unaligned move form) for every 16-byte load.
// The row at src_yuy2 and the row stride_yuy2 bytes after it are averaged
// bytewise with pavgb, then the chroma bytes are split into dst_u and dst_v.
// Each loop pass reads 32 bytes from each row, writes 8 bytes to each output
// with movq, and subtracts 16 from pix. The count is tested after the body,
// so at least one pass always runs.
4271d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4272e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
4273c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                                uint8* dst_u, uint8* dst_v, int pix) {
4274e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4275e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       esi
4276e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       edi
4277e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_yuy2 (first +8 skips the pushed esi and edi)
4278e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        esi, [esp + 8 + 8]    // stride_yuy2
4279e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_u
4280e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_v
4281e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 8 + 20]   // pix
4282e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4283e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm5, 8
4284e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
4285e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4286c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4287e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4288e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm0, [eax]
4289e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm1, [eax + 16]
4290e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm2, [eax + esi]     // same columns, row at src + stride
4291e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm3, [eax + esi + 16]
4292e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4293e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm0, xmm2            // bytewise average of the two rows
4294e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm1, xmm3
4295e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm0, 8      // YUYV -> UVUV (keep the odd bytes)
4296e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8
4297e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1
4298e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, xmm0   // register copy so U and V can be separated independently
4299e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5  // U
4300e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm0
4301e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8     // V
4302e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm1, xmm1
4303e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx], xmm0        // store 8 U bytes
4304e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx + edi], xmm1  // store 8 V bytes
4305e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 8]
4306e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        ecx, 16
430718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4308e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4309e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        edi
4310e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        esi
4311e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4312e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4313e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4314e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Splits the chroma of one row of packed YUY2 into separate U and V rows
// using SSE2, with movdqu (the unaligned move form) for every 16-byte load.
// Only the row at src_yuy2 is read (no second row is averaged). Each loop
// pass reads 32 source bytes, writes 8 bytes to each output with movq, and
// subtracts 16 from pix. The count is tested after the body, so at least one
// pass always runs.
4315d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4316c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.comvoid YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
4317c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                                   uint8* dst_u, uint8* dst_v, int pix) {
4318c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  __asm {
4319c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    push       edi
4320c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_yuy2 (first +4 skips the pushed edi)
4321c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
4322c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
4323c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
4324c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4325c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm5, 8
4326c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
4327c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4328c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4329c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  convertloop:
4330c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqu     xmm0, [eax]
4331c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqu     xmm1, [eax + 16]
4332c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        eax,  [eax + 32]
4333c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm0, 8      // YUYV -> UVUV (keep the odd bytes)
4334c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm1, 8
4335c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm1
4336c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm1, xmm0   // register copy so U and V can be separated independently
4337c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm0, xmm5  // U
4338c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm0
4339c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm1, 8     // V
4340c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm1, xmm1
4341c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx], xmm0        // store 8 U bytes
4342c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx + edi], xmm1  // store 8 V bytes
4343c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        edx, [edx + 8]
4344c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        ecx, 16
4345c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    jg         convertloop
4346c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4347c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pop        edi
4348c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    ret
4349c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  }
4350c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com}
4351c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
// Extracts the Y bytes from one row of packed UYVY using SSE2.
// Each loop pass reads 32 source bytes and stores 16 bytes to dst_y, both
// with movdqa (the aligned move form), and subtracts 16 from pix. No mask
// register is needed here: shifting each word right by 8 keeps the odd
// bytes. The count is tested after the body, so at least one pass always
// runs and pix is effectively rounded up to a multiple of 16.
4352c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com__declspec(naked) __declspec(align(16))
4353e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid UYVYToYRow_SSE2(const uint8* src_uyvy,
4354e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                     uint8* dst_y, int pix) {
4355e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4356e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 4]    // src_uyvy
4357e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8]    // dst_y
4358e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 12]   // pix
4359e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4360c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4361e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4362e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm0, [eax]
4363e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, [eax + 16]
4364e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4365e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm0, 8    // odd bytes are Y
4366e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8
4367e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1 // 16 words -> 16 Y bytes
436818184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16    // sets the flags tested by jg; the store and lea below do not change them
4369e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     [edx], xmm0
4370e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 16]
437118184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4372e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4373e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4374e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4375e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Produces one U row and one V row from two rows of packed UYVY using SSE2.
// The row at src_uyvy and the row stride_uyvy bytes after it are averaged
// bytewise with pavgb, then the chroma bytes are split into dst_u and dst_v.
// Each loop pass reads 32 bytes from each row with movdqa (the aligned move
// form), writes 8 bytes to each output with movq, and subtracts 16 from pix.
// The count is tested after the body, so at least one pass always runs.
4376d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4377e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4378c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                      uint8* dst_u, uint8* dst_v, int pix) {
4379e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4380e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       esi
4381e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       edi
4382e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_uyvy (first +8 skips the pushed esi and edi)
4383e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        esi, [esp + 8 + 8]    // stride_uyvy
4384e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_u
4385e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_v
4386e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 8 + 20]   // pix
4387e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4388e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm5, 8
4389e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
4390e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4391c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4392e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4393e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm0, [eax]
4394e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, [eax + 16]
4395e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm2, [eax + esi]     // same columns, row at src + stride
4396e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm3, [eax + esi + 16]
4397e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4398e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm0, xmm2            // bytewise average of the two rows
4399e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm1, xmm3
4400e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5   // UYVY -> UVUV (keep the even bytes)
4401e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm1, xmm5
4402e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1
4403e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, xmm0   // copy so U and V can be separated independently
4404e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5  // U
4405e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm0
4406e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8     // V
4407e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm1, xmm1
4408e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx], xmm0        // store 8 U bytes
4409e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx + edi], xmm1  // store 8 V bytes
4410e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 8]
4411e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        ecx, 16
441218184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4413e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4414e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        edi
4415e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        esi
4416e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4417e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4418e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4419e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Splits the chroma of one packed UYVY row into separate U and V rows.
// Each loop pass reads 32 source bytes (16 pixels) and writes 8 bytes to
// dst_u and 8 bytes to dst_v. The count is tested after the body, so at
// least one pass always runs and pix is effectively rounded up to 16.
// src_uyvy is read with movdqa and must therefore be 16-byte aligned.
// Naked function: arguments are fetched directly from the stack.
4420d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4421c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.comvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4422c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                         uint8* dst_u, uint8* dst_v, int pix) {
4423c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  __asm {
4424c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    push       edi                   // saved here, restored before ret
4425c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uyvy
4426c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
4427c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
4428c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
4429c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4430c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm5, 8
4431c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
4432c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4433c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4434c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  convertloop:
4435c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm0, [eax]           // aligned load of 8 pixels
4436c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm1, [eax + 16]      // aligned load of the next 8 pixels
4437c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        eax,  [eax + 32]
4438c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm0, xmm5   // UYVY -> UVUV (keep even bytes, drop Y)
4439c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm1, xmm5
4440c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm1   // 16 bytes of interleaved U, V
4441c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm1, xmm0
4442c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm0, xmm5  // U (even bytes of UVUV)
4443c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm0
4444c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm1, 8     // V (odd bytes of UVUV)
4445c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm1, xmm1
4446c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx], xmm0         // store 8 U bytes
4447c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx + edi], xmm1   // store 8 V bytes
4448c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        edx, [edx + 8]
4449c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        ecx, 16     // 16 pixels consumed per pass
4450c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    jg         convertloop
4451c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4452c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pop        edi
4453c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    ret
4454c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  }
4455c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com}
4456c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
// Extracts the Y (luma) samples from one packed UYVY row.
// Each loop pass reads 32 source bytes (16 pixels) and writes 16 Y bytes.
// Loads and stores use movdqu, so neither pointer needs to be aligned.
// The count is tested after the body, so at least one pass always runs.
// Naked function: arguments are fetched directly from the stack.
4457c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com__declspec(naked) __declspec(align(16))
4458e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
4459e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com                               uint8* dst_y, int pix) {
4460e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4461e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 4]    // src_uyvy
4462e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8]    // dst_y
4463e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 12]   // pix
4464e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4465c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4466e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4467e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm0, [eax]           // unaligned load of 8 pixels
4468e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm1, [eax + 16]      // unaligned load of the next 8 pixels
4469e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4470e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm0, 8    // odd bytes are Y
4471e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8
4472e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1 // 16 Y bytes
447318184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    sub        ecx, 16    // sets the flags tested by jg below
4474e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     [edx], xmm0
4475e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 16]   // lea leaves the flags untouched
447618184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4477e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4478e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4479e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4480e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
// Produces one U row and one V row from two packed UYVY rows.
// The row at src_uyvy and the row at src_uyvy + stride_uyvy are averaged
// byte-wise with pavgb before the chroma bytes are separated.
// Each loop pass reads 32 bytes (16 pixels) from each row and writes 8 bytes
// to dst_u and 8 bytes to dst_v. Loads use movdqu, so the source does not
// need to be aligned. The count is tested after the body, so at least one
// pass always runs.
// Naked function: arguments are fetched directly from the stack.
4481d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com__declspec(naked) __declspec(align(16))
4482e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.comvoid UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
4483c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                                uint8* dst_u, uint8* dst_v, int pix) {
4484e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  __asm {
4485e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       esi                   // saved here, restored before ret
4486e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    push       edi                   // saved here, restored before ret
4487e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_uyvy
4488e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        esi, [esp + 8 + 8]    // stride_uyvy
4489e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_u
4490e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_v
4491e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    mov        ecx, [esp + 8 + 20]   // pix
4492e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4493e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm5, 8
4494e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
4495e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4496c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4497e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  convertloop:
4498e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm0, [eax]             // first row, 8 pixels
4499e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm1, [eax + 16]        // first row, next 8 pixels
4500e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm2, [eax + esi]       // second row, 8 pixels
4501e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqu     xmm3, [eax + esi + 16]  // second row, next 8 pixels
4502e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        eax,  [eax + 32]
4503e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm0, xmm2   // byte-wise average of the two rows
4504e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pavgb      xmm1, xmm3
4505e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5   // UYVY -> UVUV (keep even bytes, drop Y)
4506e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm1, xmm5
4507e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm1   // 16 bytes of interleaved U, V
4508e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movdqa     xmm1, xmm0
4509e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pand       xmm0, xmm5  // U (even bytes of UVUV)
4510e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm0, xmm0
4511e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    psrlw      xmm1, 8     // V (odd bytes of UVUV)
4512e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    packuswb   xmm1, xmm1
4513e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx], xmm0         // store 8 U bytes
4514e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    movq       qword ptr [edx + edi], xmm1   // store 8 V bytes
4515e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    lea        edx, [edx + 8]
4516e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    sub        ecx, 16     // 16 pixels consumed per pass
451718184fd19dba08d6567357e3913285a779e4b9f3fbarchard@google.com    jg         convertloop
4518e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4519e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        edi
4520e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    pop        esi
4521e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com    ret
4522e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com  }
4523e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com}
4524c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
// Splits the chroma of one packed UYVY row into separate U and V rows.
// Same processing as the movdqa-based variant, but the source is read with
// movdqu so src_uyvy does not need to be 16-byte aligned.
// Each loop pass reads 32 source bytes (16 pixels) and writes 8 bytes to
// dst_u and 8 bytes to dst_v. The count is tested after the body, so at
// least one pass always runs.
// Naked function: arguments are fetched directly from the stack.
4525c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com__declspec(naked) __declspec(align(16))
4526c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.comvoid UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
4527c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com                                   uint8* dst_u, uint8* dst_v, int pix) {
4528c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  __asm {
4529c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    push       edi                   // saved here, restored before ret
4530c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uyvy
4531c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edx, [esp + 4 + 8]    // dst_u
4532c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_v
4533c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
4534c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
4535c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm5, 8
4536c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        edi, edx              // edi = dst_v - dst_u, so [edx + edi] is dst_v
4537c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4538c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4539c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  convertloop:
4540c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqu     xmm0, [eax]           // unaligned load of 8 pixels
4541c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqu     xmm1, [eax + 16]      // unaligned load of the next 8 pixels
4542c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        eax,  [eax + 32]
4543c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm0, xmm5   // UYVY -> UVUV (keep even bytes, drop Y)
4544c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm1, xmm5
4545c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm1   // 16 bytes of interleaved U, V
4546c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movdqa     xmm1, xmm0
4547c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pand       xmm0, xmm5  // U (even bytes of UVUV)
4548c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm0, xmm0
4549c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    psrlw      xmm1, 8     // V (odd bytes of UVUV)
4550c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    packuswb   xmm1, xmm1
4551c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx], xmm0         // store 8 U bytes
4552c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    movq       qword ptr [edx + edi], xmm1   // store 8 V bytes
4553c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    lea        edx, [edx + 8]
4554c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    sub        ecx, 16     // 16 pixels consumed per pass
4555c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    jg         convertloop
4556c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com
4557c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    pop        edi
4558c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com    ret
4559c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com  }
4560c704f789e9305890d865e6334f57a9febbc83e45fbarchard@google.com}
4561b95dbf24951d8b7118f680d75c7456a5f5d57bfffbarchard@google.com#endif  // HAS_YUY2TOYROW_SSE2
4562e5f3fd4cc870b9b22112b3b2f25af06e067c8b7dfbarchard@google.com
4563965fb914ea3f5057cd186763c9af5d3110c44acdfbarchard@google.com#ifdef HAS_ARGBBLENDROW_SSE2
45641702ec78f85cc484e10eeac501971f76ab173b83fbarchard@google.com// Blends a row of src_argb0 over src_argb1 into dst_argb; the wide loop does 4 pixels per pass.
// Per colour byte the result is the saturating sum
//   src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8)
// where alpha0 is byte 3 of the src_argb0 pixel; the output alpha byte is
// always 255.
// Structure: a 1 pixel loop runs until dst_argb is 16-byte aligned, then a
// 4 pixel loop (unaligned loads, aligned stores), then a 1 pixel loop for
// the remainder. Nothing is written when width < 1.
// Naked function: arguments are fetched directly from the stack.
456591ab139558747d9109552ec65632e6da9e170861fbarchard@google.com__declspec(naked) __declspec(align(16))
4566bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.comvoid ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4567bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com                       uint8* dst_argb, int width) {
4568c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com  __asm {
4569c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    push       esi              // saved here, restored before ret
4570c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
4571c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
4572c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
4573c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
4574c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm7, xmm7       // generate constant 1 in every 16-bit word
4575c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm7, 15
4576c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4577c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm6, 8
4578c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4579c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psllw      xmm5, 8
4580c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4581c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pslld      xmm4, 24
4582c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4583bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    sub        ecx, 1           // ecx = width - 1 (bias used by the 1 pixel loops)
4584bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    je         convertloop1     // only 1 pixel?
4585bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jl         convertloop1b    // width < 1: nothing to do
4586bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com
4587bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    // 1 pixel loop until destination pointer is aligned.
4588bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  alignloop1:
4589bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    test       edx, 15          // aligned?
4590bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    je         alignloop1b
4591bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       xmm3, [eax]      // src argb (1 pixel)
4592bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    lea        eax, [eax + 4]
4593bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movdqa     xmm0, xmm3       // src argb
4594bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pxor       xmm3, xmm4       // ~alpha (255 - alpha in byte 3)
4595bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       xmm2, [esi]      // _r_b
4596bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    psrlw      xmm3, 8          // alpha: inverted alpha now in the low byte of its word
4597bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words (copy each odd word over the even one)
4598bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pshuflw    xmm3, xmm3, 0F5h
4599bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pand       xmm2, xmm6       // _r_b
4600bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
4601bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
4602bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       xmm1, [esi]      // _a_g
4603bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    lea        esi, [esi + 4]
4604bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    psrlw      xmm1, 8          // _a_g
4605bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
4606bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4607bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4608bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
4609bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4610bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
4611bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    sub        ecx, 1           // sets the flags tested by jge below
4612bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       [edx], xmm0      // store 1 pixel
4613bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    lea        edx, [edx + 4]
4614bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jge        alignloop1
4615bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com
4616bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  alignloop1b:
4617bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    add        ecx, 1 - 4       // rebias: ecx = remaining pixels - 4
4618bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jl         convertloop4b    // fewer than 4 pixels left
4619bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com
4620794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    // 4 pixel loop.
4621bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop4:
4622794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    movdqu     xmm3, [eax]      // src argb
4623794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    lea        eax, [eax + 16]
4624c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movdqa     xmm0, xmm3       // src argb
4625c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pxor       xmm3, xmm4       // ~alpha
4626794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    movdqu     xmm2, [esi]      // _r_b
4627c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm3, 8          // alpha
4628c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
462998a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm3, xmm3, 0F5h
4630c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pand       xmm2, xmm6       // _r_b
4631c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
4632c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
46331702ec78f85cc484e10eeac501971f76ab173b83fbarchard@google.com    movdqu     xmm1, [esi]      // _a_g
4634794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    lea        esi, [esi + 16]
4635c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm1, 8          // _a_g
4636c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
4637c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4638c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4639c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
4640c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4641c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
4642c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    sub        ecx, 4           // sets the flags tested by jge below
4643c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movdqa     [edx], xmm0      // aligned store of 4 pixels
4644794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    lea        edx, [edx + 16]
4645bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jge        convertloop4
4646c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4647bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop4b:
4648bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    add        ecx, 4 - 1       // rebias: ecx = remaining pixels - 1
4649bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jl         convertloop1b    // no pixels left
4650c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4651bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    // 1 pixel loop.
4652bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop1:
4653794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    movd       xmm3, [eax]      // src argb
4654c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    lea        eax, [eax + 4]
4655c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movdqa     xmm0, xmm3       // src argb
4656c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pxor       xmm3, xmm4       // ~alpha
4657c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movd       xmm2, [esi]      // _r_b
4658c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm3, 8          // alpha
465998a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
466098a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm3, xmm3, 0F5h
4661c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pand       xmm2, xmm6       // _r_b
4662c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
4663c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
4664c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movd       xmm1, [esi]      // _a_g
4665c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    lea        esi, [esi + 4]
4666c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm1, 8          // _a_g
4667c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
4668c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
4669c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4670c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
4671c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4672c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
4673c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    sub        ecx, 1           // sets the flags tested by jge below
4674c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movd       [edx], xmm0      // store 1 pixel
4675c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    lea        edx, [edx + 4]
4676bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jge        convertloop1
4677c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4678bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop1b:
4679c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pop        esi
4680c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    ret
4681c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com  }
4682c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com}
4683bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com#endif  // HAS_ARGBBLENDROW_SSE2
4684c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4685c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com#ifdef HAS_ARGBBLENDROW_SSSE3
4686bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com// Shuffle table for isolating alpha.
// Used as the pshufb control operand: for each of the 4 pixels, byte 3 of the
// pixel (indices 3, 7, 11, 15) is copied into the low byte of both 16-bit
// words of that pixel, and the 0x80 entries clear the high byte of each word.
4687851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleAlpha = {
4688d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4689d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4690d2f4413d29d15b94d971630ba555dd0cd8fcc8c2fbarchard@google.com};
4691c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Same as SSE2, but replaces:
4692c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com//    psrlw      xmm3, 8          // alpha
469398a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
469498a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com//    pshuflw    xmm3, xmm3, 0F5h
4695c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com// with..
4696c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com//    pshufb     xmm3, kShuffleAlpha // alpha
4697c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Blend 4 pixels at a time.
4698c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4699c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com__declspec(naked) __declspec(align(16))
4700bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.comvoid ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4701bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com                        uint8* dst_argb, int width) {
4702c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com  __asm {
4703c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    push       esi
4704c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
4705c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
4706c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
4707c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
470838157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
4709c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm7, 15
4710c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
4711c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm6, 8
4712c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
4713c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psllw      xmm5, 8
4714c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
4715c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pslld      xmm4, 24
4716c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4717bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    sub        ecx, 1
4718bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    je         convertloop1     // only 1 pixel?
4719bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jl         convertloop1b
4720bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com
4721bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    // 1 pixel loop until destination pointer is aligned.
4722bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  alignloop1:
4723dd3b137f5d9d87e92cb44f754f60b90ba50e31bcfbarchard@google.com    test       edx, 15          // aligned?
4724dd3b137f5d9d87e92cb44f754f60b90ba50e31bcfbarchard@google.com    je         alignloop1b
4725bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       xmm3, [eax]
4726bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    lea        eax, [eax + 4]
4727bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movdqa     xmm0, xmm3       // src argb
4728bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pxor       xmm3, xmm4       // ~alpha
4729bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       xmm2, [esi]      // _r_b
4730bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pshufb     xmm3, kShuffleAlpha // alpha
4731bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pand       xmm2, xmm6       // _r_b
4732bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
4733bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * alpha
4734bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       xmm1, [esi]      // _a_g
4735bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    lea        esi, [esi + 4]
4736bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    psrlw      xmm1, 8          // _a_g
4737bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
4738bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * alpha
4739bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4740bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
4741bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4742bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
4743bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    sub        ecx, 1
4744bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    movd       [edx], xmm0
4745bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    lea        edx, [edx + 4]
4746bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jge        alignloop1
4747bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com
4748bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  alignloop1b:
4749bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    add        ecx, 1 - 4
4750bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jl         convertloop4b
4751bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com
4752f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    test       eax, 15          // unaligned?
4753f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    jne        convertuloop4
4754f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    test       esi, 15          // unaligned?
4755f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    jne        convertuloop4
4756f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com
4757794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    // 4 pixel loop.
4758bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop4:
4759f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    movdqa     xmm3, [eax]      // src argb
4760f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    lea        eax, [eax + 16]
4761f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    movdqa     xmm0, xmm3       // src argb
4762f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    pxor       xmm3, xmm4       // ~alpha
4763f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    movdqa     xmm2, [esi]      // _r_b
4764f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    pshufb     xmm3, kShuffleAlpha // alpha
4765f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    pand       xmm2, xmm6       // _r_b
4766f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
4767f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * alpha
4768f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    movdqa     xmm1, [esi]      // _a_g
4769f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    lea        esi, [esi + 16]
4770f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    psrlw      xmm1, 8          // _a_g
4771f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
4772f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * alpha
4773f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4774f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
4775f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4776f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
4777f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    sub        ecx, 4
4778f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    movdqa     [edx], xmm0
4779f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    lea        edx, [edx + 16]
4780f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    jge        convertloop4
4781f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    jmp        convertloop4b
4782f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com
4783f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    // 4 pixel unaligned loop.
4784f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com  convertuloop4:
4785794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    movdqu     xmm3, [eax]      // src argb
4786794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    lea        eax, [eax + 16]
4787c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movdqa     xmm0, xmm3       // src argb
4788c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pxor       xmm3, xmm4       // ~alpha
47891702ec78f85cc484e10eeac501971f76ab173b83fbarchard@google.com    movdqu     xmm2, [esi]      // _r_b
4790794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    pshufb     xmm3, kShuffleAlpha // alpha
4791c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pand       xmm2, xmm6       // _r_b
4792c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
4793c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * alpha
47941702ec78f85cc484e10eeac501971f76ab173b83fbarchard@google.com    movdqu     xmm1, [esi]      // _a_g
4795794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    lea        esi, [esi + 16]
4796c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm1, 8          // _a_g
4797c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
4798c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * alpha
4799c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
4800c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
4801c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
4802c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
4803c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    sub        ecx, 4
4804c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com    movdqa     [edx], xmm0
4805794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    lea        edx, [edx + 16]
4806f877e71995128ae4c945591574367da5d66a1ac4fbarchard@google.com    jge        convertuloop4
4807c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4808bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop4b:
4809bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    add        ecx, 4 - 1
4810bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jl         convertloop1b
4811c757f308eab211f9d5467a089052e7d84606f6c1fbarchard@google.com
4812bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    // 1 pixel loop.
4813bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop1:
4814794fe1236a29b272cf36442c8c4c3d97a33ff64ffbarchard@google.com    movd       xmm3, [eax]      // src argb
48155ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    lea        eax, [eax + 4]
48165ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    movdqa     xmm0, xmm3       // src argb
48175ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pxor       xmm3, xmm4       // ~alpha
48185ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    movd       xmm2, [esi]      // _r_b
48195ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pshufb     xmm3, kShuffleAlpha // alpha
48205ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pand       xmm2, xmm6       // _r_b
48215ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    paddw      xmm3, xmm7       // 256 - alpha
48225ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pmullw     xmm2, xmm3       // _r_b * alpha
48235ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    movd       xmm1, [esi]      // _a_g
48245ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    lea        esi, [esi + 4]
48255ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    psrlw      xmm1, 8          // _a_g
48265ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    por        xmm0, xmm4       // set alpha to 255
48275ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pmullw     xmm1, xmm3       // _a_g * alpha
48285ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    psrlw      xmm2, 8          // _r_b convert to 8 bits again
48295ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    paddusb    xmm0, xmm2       // + src argb
48305ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
48315ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    paddusb    xmm0, xmm1       // + src argb
48325ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    sub        ecx, 1
48335ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    movd       [edx], xmm0
48345ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    lea        edx, [edx + 4]
4835bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com    jge        convertloop1
48365ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com
4837bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com  convertloop1b:
48385ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    pop        esi
48395ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com    ret
48405ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com  }
48415ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com}
4842bac5f2c3ee12535817448e606c7a7704bbae8321fbarchard@google.com#endif  // HAS_ARGBBLENDROW_SSSE3
48435ff3a8fec5fa54bca8905f1eb6eb69c14d5fb79ffbarchard@google.com
48441d160cb99f2b05df80c4555bd769825ad1175dc9fbarchard@google.com#ifdef HAS_ARGBATTENUATEROW_SSE2
48458ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com// Attenuate 4 pixels at a time.
4846c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Aligned to 16 bytes.
// Multiplies the three low (color) bytes of each 4 byte pixel by the pixel's
// alpha, which is the top byte of every dword, and stores the pixel with its
// alpha byte copied through unchanged.
// Every load and store is movdqa, so src_argb and dst_argb must both be
// 16 byte aligned.  One pass handles 4 pixels and the loop repeats while the
// remaining count is > 0, so a width that is not a multiple of 4 is rounded
// up to the next 4 pixels.
48478ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com__declspec(naked) __declspec(align(16))
48488ed54222e723037322579f15c36d4faddb924e91fbarchard@google.comvoid ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
48498ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com  __asm {
    // Naked function: arguments are read directly relative to esp.
48508ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    mov        eax, [esp + 4]   // src_argb
48518ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    mov        edx, [esp + 8]   // dst_argb
48528ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    mov        ecx, [esp + 12]  // width
48538ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000 (alpha byte)
48548ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    pslld      xmm4, 24
48558ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff (color bytes)
48568ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    psrld      xmm5, 8
48578ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com
4858c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
48598ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com convertloop:
48608ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    movdqa     xmm0, [eax]      // read 4 pixels
48618ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    punpcklbw  xmm0, xmm0       // first 2 pixels, each byte doubled into a word
486298a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words: pixel 1 alpha into words 4-7
486398a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm2, xmm2, 0FFh // pixel 0 alpha into words 0-3
48648ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    pmulhuw    xmm0, xmm2       // rgb * a, high 16 bits of each product
48658ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    movdqa     xmm1, [eax]      // read 4 pixels
48668ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    punpckhbw  xmm1, xmm1       // next 2 pixels, each byte doubled into a word
486798a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words: pixel 3 alpha into words 4-7
486898a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm2, xmm2, 0FFh // pixel 2 alpha into words 0-3
48698ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    pmulhuw    xmm1, xmm2       // rgb * a, high 16 bits of each product
4870810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movdqa     xmm2, [eax]      // reload the 4 pixels for their alphas
4871008ecea4fe387388255f9d5ffcd8f11cc462b64bfbarchard@google.com    lea        eax, [eax + 16]
48728ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    psrlw      xmm0, 8          // reduce products to 8 bits
4873810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    pand       xmm2, xmm4       // isolate original alphas
48748ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    psrlw      xmm1, 8          // reduce products to 8 bits
48758ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    packuswb   xmm0, xmm1       // words back to 4 pixels of bytes
48768ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    pand       xmm0, xmm5       // clear the computed alpha, keep colors
4877810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    por        xmm0, xmm2       // copy original alphas
48788ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    sub        ecx, 4           // sets the flags tested by jg below
4879008ecea4fe387388255f9d5ffcd8f11cc462b64bfbarchard@google.com    movdqa     [edx], xmm0      // write 4 pixels (flags untouched)
4880008ecea4fe387388255f9d5ffcd8f11cc462b64bfbarchard@google.com    lea        edx, [edx + 16]
48818ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    jg         convertloop      // loop while remaining width > 0
48828ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com
48838ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com    ret
48848ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com  }
48858ed54222e723037322579f15c36d4faddb924e91fbarchard@google.com}
48861d160cb99f2b05df80c4555bd769825ad1175dc9fbarchard@google.com#endif  // HAS_ARGBATTENUATEROW_SSE2
4887f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com
4888eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com#ifdef HAS_ARGBATTENUATEROW_SSSE3
4889c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Shuffle table duplicating alpha.
// pshufb control for the first 2 pixels of a 4 pixel load: source byte 3
// (pixel 0 alpha) fills both bytes of words 0-2 and source byte 7 (pixel 1
// alpha) fills words 4-6.  The 128 entries zero words 3 and 7, which are the
// alpha positions.
4890851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleAlpha0 = {
4891f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4892f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com};
// pshufb control for the last 2 pixels of a 4 pixel load: source byte 11
// (pixel 2 alpha) fills both bytes of words 0-2 and source byte 15 (pixel 3
// alpha) fills words 4-6.  The 128 entries zero words 3 and 7, which are the
// alpha positions.
4893851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const uvec8 kShuffleAlpha1 = {
4894f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4895f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4896f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com};
// Attenuate 4 pixels at a time using pshufb to broadcast each pixel's alpha.
// Multiplies the three low (color) bytes of each 4 byte pixel by the pixel's
// alpha (top byte of every dword); the alpha byte is copied through unchanged.
// Pixel loads and stores are movdqu, so src_argb and dst_argb may be
// unaligned.  One pass handles 4 pixels and the loop repeats while the
// remaining count is > 0, so a width that is not a multiple of 4 is rounded
// up to the next 4 pixels.
4897f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com__declspec(naked) __declspec(align(16))
4898f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.comvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
48994660679ff38f9ef755ce914ec1bf98d4ad040540fbarchard@google.com  __asm {
    // Naked function: arguments are read directly relative to esp.
4900f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    mov        eax, [esp + 4]   // src_argb
4901f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    mov        edx, [esp + 8]   // dst_argb
4902f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    mov        ecx, [esp + 12]  // width
4903f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000 (alpha byte)
4904f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    pslld      xmm3, 24
4905f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    movdqa     xmm4, kShuffleAlpha0  // alpha broadcast for pixels 0, 1
4906f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    movdqa     xmm5, kShuffleAlpha1  // alpha broadcast for pixels 2, 3
4907f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com
4908c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4909f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com convertloop:
491038157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm0, [eax]      // read 4 pixels
4911810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    pshufb     xmm0, xmm4       // isolate first 2 alphas; alpha word = 0
491238157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm1, [eax]      // read 4 pixels
4913f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs, bytes doubled to words
4914f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    pmulhuw    xmm0, xmm1       // rgb * a, high 16 bits; alpha product is 0
491538157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm1, [eax]      // read 4 pixels
4916f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    pshufb     xmm1, xmm5       // isolate next 2 alphas; alpha word = 0
491738157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm2, [eax]      // read 4 pixels
4918f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs, bytes doubled to words
4919f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    pmulhuw    xmm1, xmm2       // rgb * a, high 16 bits; alpha product is 0
492038157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm2, [eax]      // mask original alpha
4921008ecea4fe387388255f9d5ffcd8f11cc462b64bfbarchard@google.com    lea        eax, [eax + 16]
4922f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    pand       xmm2, xmm3       // keep only the original alpha bytes
4923f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    psrlw      xmm0, 8          // reduce products to 8 bits
4924f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    psrlw      xmm1, 8          // reduce products to 8 bits
4925f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    packuswb   xmm0, xmm1       // words back to 4 pixels of bytes
    // The packed alpha bytes are already 0 (zeroed by the shuffle tables), so
    // the original alpha can be OR'ed in without masking xmm0 first.
4926f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    por        xmm0, xmm2       // copy original alpha
4927f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    sub        ecx, 4           // sets the flags tested by jg below
492838157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     [edx], xmm0      // write 4 pixels (flags untouched)
4929008ecea4fe387388255f9d5ffcd8f11cc462b64bfbarchard@google.com    lea        edx, [edx + 16]
4930f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    jg         convertloop      // loop while remaining width > 0
4931f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com
4932f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com    ret
4933f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com  }
4934f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com}
4935eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com#endif  // HAS_ARGBATTENUATEROW_SSSE3
4936f2c86d01cc46b0851e0bf88429dd064b8c8b0dbafbarchard@google.com
4937d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com#ifdef HAS_ARGBATTENUATEROW_AVX2
4938d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com// Shuffle table duplicating alpha.
// vpshufb control (the same 16 bytes for both 128 bit lanes) applied to
// pixels whose bytes were already doubled into words: bytes 6-7 (alpha word
// of the lane's first pixel) are copied into words 0-2 and bytes 14-15 (alpha
// word of the lane's second pixel) into words 4-6.  The 128 entries zero
// words 3 and 7, which are the alpha positions.
4939851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const ulvec8 kShuffleAlpha_AVX2 = {
4940d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4941d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4942d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4943d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4944d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com};
// Attenuate 8 pixels at a time.
// Multiplies the three low (color) bytes of each 4 byte pixel by the pixel's
// alpha (top byte of every dword); the alpha byte is copied through unchanged.
// Pixel loads and stores are vmovdqu, so src_argb and dst_argb may be
// unaligned.  One pass handles 8 pixels and the loop repeats while the
// remaining count is > 0, so a width that is not a multiple of 8 is rounded
// up to the next 8 pixels.
// NOTE(review): the table is loaded with vmovdqa, which presumes ulvec8 is
// declared with 32 byte alignment - confirm against its definition.
4945d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com__declspec(naked) __declspec(align(16))
4946d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.comvoid ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4947d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com  __asm {
    // Naked function: arguments are read directly relative to esp.
4948d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    mov        eax, [esp + 4]   // src_argb
4949d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    mov        edx, [esp + 8]   // dst_argb
4950d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    mov        ecx, [esp + 12]  // width
4951d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    sub        edx, eax         // edx = dst - src, so [eax + edx] is dst
4952d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vmovdqa    ymm4, kShuffleAlpha_AVX2
4953d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
4954d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpslld     ymm5, ymm5, 24
4955d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com
4956c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4957d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com convertloop:
4958d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vmovdqu    ymm6, [eax]       // read 8 pixels.
    // The unpacks work within each 128 bit lane, so ymm0 receives pixels
    // 0, 1, 4, 5 and ymm1 pixels 2, 3, 6, 7 ("mutated" order).  vpackuswb is
    // per lane as well and restores the original order ("unmutated").
4959d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
4960d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
4961d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas; alpha word zeroed
4962d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas; alpha word zeroed
4963d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a, high 16 bits; alpha product is 0
4964d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a, high 16 bits; alpha product is 0
4965d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpand      ymm6, ymm6, ymm5  // isolate alpha
4966d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpsrlw     ymm0, ymm0, 8     // reduce products to 8 bits
4967d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpsrlw     ymm1, ymm1, 8     // reduce products to 8 bits
4968d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
4969d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vpor       ymm0, ymm0, ymm6  // copy original alpha
4970d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    sub        ecx, 8            // sets the flags tested by jg below
4971d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    vmovdqu    [eax + edx], ymm0 // write 8 pixels (flags untouched)
4972d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    lea        eax, [eax + 32]   // advances src and, via edx, dst
4973d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    jg         convertloop       // loop while remaining width > 0
4974d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com
49759b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper                   // clear upper ymm halves before returning
4976d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com    ret
4977d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com  }
4978d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com}
4979d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com#endif  // HAS_ARGBATTENUATEROW_AVX2
4980d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3ffbarchard@google.com
4981eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com#ifdef HAS_ARGBUNATTENUATEROW_SSE2
4982810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com// Unattenuate 4 pixels at a time.
4983c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Aligned to 16 bytes.
// NOTE(review): every pixel load and store below is movdqu (plus scalar
// movzx / movd), so the code itself does not need 16 byte aligned pointers;
// the "Aligned to 16 bytes" line above may be stale - confirm.
// Each pixel's alpha byte indexes fixed_invtbl8 (declared elsewhere) to fetch
// one dword.  The low word of that dword multiplies the three color channels
// and the high word multiplies the alpha channel.  Presumably the low word is
// a fixed point reciprocal of alpha and the high word leaves alpha unchanged,
// as the "1, a, a, a" notes suggest - confirm against the table.
// One pass handles 4 pixels and the loop repeats while the remaining count is
// > 0, so a width that is not a multiple of 4 is rounded up to the next 4.
4984810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com__declspec(naked) __declspec(align(16))
4985810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.comvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4986810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com                             int width) {
4987810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com  __asm {
    // esi and edi are used as scratch for the alpha table indices.  The two
    // pushes move the arguments 8 bytes further from esp.
4988810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    push       esi
4989810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    push       edi
4990810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_argb
4991810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    mov        edx, [esp + 8 + 8]   // dst_argb
4992810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    mov        ecx, [esp + 8 + 12]  // width
4993810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com
4994c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
4995810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com convertloop:
499638157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm0, [eax]      // read 4 pixels
4997810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movzx      esi, byte ptr [eax + 3]  // first alpha
4998810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movzx      edi, byte ptr [eax + 7]  // second alpha
4999810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    punpcklbw  xmm0, xmm0       // first 2 pixels, each byte doubled into a word
5000810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
5001810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
    // pshuflw 040h: words 0-2 = entry low word, word 3 = entry high word.
500298a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm2, xmm2, 040h // first pixel's 4 inv_alpha words.  1, a, a, a
500398a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm3, xmm3, 040h // second pixel's 4 inv_alpha words
5004810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movlhps    xmm2, xmm3       // pixel 0 in low qword, pixel 1 in high qword
5005810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    pmulhuw    xmm0, xmm2       // rgb * inverse alpha, high 16 bits
5006810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com
500738157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     xmm1, [eax]      // read 4 pixels
5008810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movzx      esi, byte ptr [eax + 11]  // third alpha
5009810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movzx      edi, byte ptr [eax + 15]  // fourth alpha
5010810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    punpckhbw  xmm1, xmm1       // next 2 pixels, each byte doubled into a word
5011810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
5012810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
501398a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm2, xmm2, 040h // third pixel's 4 inv_alpha words
501498a1fbf5e9797112515d591b1262db6ae049b8fafbarchard@google.com    pshuflw    xmm3, xmm3, 040h // fourth pixel's 4 inv_alpha words
5015810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    movlhps    xmm2, xmm3       // pixel 2 in low qword, pixel 3 in high qword
5016810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    pmulhuw    xmm1, xmm2       // rgb * inverse alpha, high 16 bits
5017bb5ea8e4df7aba47d39a0b6e74ce7cc85e5c8c3afbarchard@google.com    lea        eax, [eax + 16]
5018810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com
5019810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    packuswb   xmm0, xmm1       // saturate words to bytes: 4 pixels
5020810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    sub        ecx, 4           // sets the flags tested by jg below
502138157bdc719c403a620218bc2a35af0f9b4adc85fbarchard@google.com    movdqu     [edx], xmm0      // write 4 pixels (flags untouched)
5022bb5ea8e4df7aba47d39a0b6e74ce7cc85e5c8c3afbarchard@google.com    lea        edx, [edx + 16]
5023810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    jg         convertloop      // loop while remaining width > 0
5024810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    pop        edi
5025810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    pop        esi
5026810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com    ret
5027810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com  }
5028810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com}
5029eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com#endif  // HAS_ARGBUNATTENUATEROW_SSE2
5030810cd91079505f04cfec7481b51d04f08250d982fbarchard@google.com
50313c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com#ifdef HAS_ARGBUNATTENUATEROW_AVX2
50323c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com// Shuffle table duplicating alpha.
// vpshufb control (the same 16 bytes for both 128 bit lanes): word 0 is
// copied into words 0-2 and word 3 stays in place; likewise word 4 is copied
// into words 4-6 and word 7 stays in place.  Both ARGBUnattenuateRow_AVX2
// variants load this table.
5033851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const ulvec8 kUnattenShuffleAlpha_AVX2 = {
5034787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
5035787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
50363c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com};
5037805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
5038805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com// USE_GATHER is not on by default, due to being a slow instruction.
5039805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com#ifdef USE_GATHER
// Unattenuate 8 pixels at a time (variant built when USE_GATHER is defined).
// The 8 alpha bytes index fixed_invtbl8 (declared elsewhere) with a single
// vpgatherdd.  The low word of each fetched dword multiplies the three color
// channels and the high word multiplies the alpha channel.  Presumably the
// low word is a fixed point reciprocal of alpha and the high word leaves
// alpha unchanged, as the "1, a" notes suggest - confirm against the table.
// Pixel loads and stores are vmovdqu, so src_argb and dst_argb may be
// unaligned.  One pass handles 8 pixels and the loop repeats while the
// remaining count is > 0, so a width that is not a multiple of 8 is rounded
// up to the next 8 pixels.
50403c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com__declspec(naked) __declspec(align(16))
50413c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.comvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
50423c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com                             int width) {
50433c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com  __asm {
    // Naked function: arguments are read directly relative to esp.
50443c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    mov        eax, [esp + 4]   // src_argb
50453c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    mov        edx, [esp + 8]   // dst_argb
50463c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    mov        ecx, [esp + 12]  // width
50473c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    sub        edx, eax         // edx = dst - src, so [eax + edx] is dst
50483c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
50493c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com
5050c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
50513c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com convertloop:
50523c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vmovdqu    ymm6, [eax]       // read 8 pixels.
    // The gather clears its mask register, so the all-ones mask has to be
    // regenerated on every pass.
5053787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
50543c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
    // The unpacks work within each 128 bit lane, so ymm0 receives pixels
    // 0, 1, 4, 5 and ymm1 pixels 2, 3, 6, 7 ("mutated" order).  The word
    // unpacks of the gathered entries follow the same per lane order.
50553c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
50563c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
5057787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
5058787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
5059787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
5060787f82766394fc13ff99bb68308c922c014a6f1dfbarchard@google.com    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
50613c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
50623c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia, high 16 bits
50633c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia, high 16 bits
50643c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1  // unmutated.  Saturates words to bytes.
50653c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    sub        ecx, 8            // sets the flags tested by jg below
50663c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    vmovdqu    [eax + edx], ymm0 // write 8 pixels (flags untouched)
50673c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    lea        eax, [eax + 32]   // advances src and, via edx, dst
50683c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    jg         convertloop       // loop while remaining width > 0
50693c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com
50709b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper                   // clear upper ymm halves before returning
50713c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com    ret
50723c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com  }
50733c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com}
5074805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com#else  // USE_GATHER
// Non-gather variant of ARGBUnattenuateRow_AVX2 (the #else branch of the
// USE_GATHER conditional). Processes 8 ARGB pixels (32 bytes) per iteration:
// the alpha byte of each pixel indexes fixed_invtbl8, the eight table dwords
// are assembled into one YMM register with scalar loads in place of VPGATHER,
// and the pixel channels are multiplied by the looked-up values (high 16 bits
// of each 16x16 product are kept) before being packed back to bytes.
//   src_argb - source pixels, read with unaligned loads.
//   dst_argb - destination pixels, written with unaligned stores.
//   width    - pixel count; decremented by 8 per iteration. The loop body
//              always runs once and repeats while the remainder is > 0, so a
//              width that is not a multiple of 8 is effectively rounded up.
// fixed_invtbl8 and kUnattenShuffleAlpha_AVX2 are defined outside this
// section; their exact contents are not visible here.
5075805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com__declspec(naked) __declspec(align(16))
5076805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.comvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5077805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com                             int width) {
5078805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com  __asm {
5079805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com
5080805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    mov        eax, [esp + 4]   // src_argb
5081805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    mov        edx, [esp + 8]   // dst_argb
5082805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    mov        ecx, [esp + 12]  // width
5083805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    sub        edx, eax  // edx = dst - src, so [eax + edx] addresses dst
5084805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2  // vpshufb control used below
5085805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    // Arguments were read above, before esp is changed by the pushes.
5086805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    push       esi  // saved; used as table index scratch
5087805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    push       edi  // saved; used as table index scratch
5088805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com
5089c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5090805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com convertloop:
5091805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    // replace VPGATHER: 8 scalar dword loads indexed by each pixel's alpha
5092​9b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      esi, byte ptr [eax + 3]                 // alpha0
50939b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      edi, byte ptr [eax + 7]                 // alpha1
5094805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
5095805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
50969b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      esi, byte ptr [eax + 11]                // alpha2
50979b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      edi, byte ptr [eax + 15]                // alpha3
5098805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
5099805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
5100805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
51019b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      esi, byte ptr [eax + 19]                // alpha4
51029b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      edi, byte ptr [eax + 23]                // alpha5
5103805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
5104805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
5105805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
51069b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      esi, byte ptr [eax + 27]                // alpha6
51079b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    movzx      edi, byte ptr [eax + 31]                // alpha7
5108805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
5109805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
5110805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
5111805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
5112805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
5113805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
5114805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
5115805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    // end of VPGATHER
5116805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com
5117805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovdqu    ymm6, [eax]       // read 8 pixels.
5118805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
5119805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
5120805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
5121805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
5122805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
5123805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
5124805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia, keeping the high 16 bits
5125805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia, keeping the high 16 bits
5126805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1  // unmutated. saturates words to bytes.
5127805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    sub        ecx, 8            // 8 pixels done; sets flags for jg below
5128805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    vmovdqu    [eax + edx], ymm0  // store 8 pixels to dst
5129805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    lea        eax, [eax + 32]   // advance src (and dst via edx) without touching flags
5130805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    jg         convertloop       // repeat while remaining width > 0
5131805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com
5132805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    pop        edi
5133805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    pop        esi
51349b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper  // zero the upper halves of the YMM registers before returning
5135805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com    ret
5136805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com  }
5137805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com}
5138805fefb9d87bfbe00ae435d779e867c48c10d530fbarchard@google.com#endif  // USE_GATHER
51393c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com#endif  // HAS_ARGBATTENUATEROW_AVX2
51403c7bb050bd54264f360ced29c1cd2777483bd6f0fbarchard@google.com
5141ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com#ifdef HAS_ARGBGRAYROW_SSSE3
5142c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Each output pixel is written as (G, G, G, A): G is a weighted sum of the
// source pixel bytes using kARGBToYJ, plus the kAddYJ64 rounding term, shifted
// right by 7; A is the source alpha byte, copied unchanged. Both constants are
// defined outside this section.
// src_argb and dst_argb are accessed with movdqa, so both must be 16-byte
// aligned. width is decremented by 8 per iteration; the loop body always runs
// once and repeats while the remainder is > 0.
5143ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com__declspec(naked) __declspec(align(16))
5144eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.comvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
5145ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com  __asm {
5146eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
5147eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com    mov        edx, [esp + 8]   /* dst_argb */
5148eeac2903ef22110d475c50ef9bfd7826d3183a5efbarchard@google.com    mov        ecx, [esp + 12]  /* width */
5149050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm4, kARGBToYJ  // per-byte weights for the gray value
5150050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    movdqa     xmm5, kAddYJ64   // rounding term added before the shift
5151ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com
5152c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5153ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com convertloop:
5154221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm0, [eax]  // G: first 4 pixels
5155ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    movdqa     xmm1, [eax + 16]  // next 4 pixels
5156ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    pmaddubsw  xmm0, xmm4  // byte * weight, adjacent pairs summed into words
5157ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    pmaddubsw  xmm1, xmm4
5158ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    phaddw     xmm0, xmm1  // one word per pixel, 8 pixels
5159050b39a5cbf6c0f529531aafba36f2c846a139b1fbarchard@google.com    paddw      xmm0, xmm5  // Add .5 for rounding.
5160ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    psrlw      xmm0, 7
5161221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm0, xmm0   // 8 G bytes
5162221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm2, [eax]  // A: reload the same 8 pixels
5163221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm3, [eax + 16]
5164b8ffdc9e574a1552955dbb62369082c7a475e9fafbarchard@google.com    lea        eax, [eax + 32]  // advance src
5165221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrld      xmm2, 24  // keep only the alpha byte of each pixel
5166221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrld      xmm3, 24
5167221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm2, xmm3
5168221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm2, xmm2   // 8 A bytes
5169221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
5170221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpcklbw  xmm0, xmm0   // 8 GG words
5171221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpcklbw  xmm3, xmm2   // 8 GA words
5172ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    movdqa     xmm1, xmm0
5173221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpcklwd  xmm0, xmm3   // GGGA first 4
5174221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpckhwd  xmm1, xmm3   // GGGA next 4
5175ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    sub        ecx, 8  // sets flags for jg; the stores and lea below leave them intact
5176b8ffdc9e574a1552955dbb62369082c7a475e9fafbarchard@google.com    movdqa     [edx], xmm0
5177b8ffdc9e574a1552955dbb62369082c7a475e9fafbarchard@google.com    movdqa     [edx + 16], xmm1
5178b8ffdc9e574a1552955dbb62369082c7a475e9fafbarchard@google.com    lea        edx, [edx + 32]  // advance dst
5179ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    jg         convertloop
5180ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com    ret
5181ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com  }
5182ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com}
5183ffaea7eee38e593a3e63553ffa90e554ba81fe30fbarchard@google.com#endif  // HAS_ARGBGRAYROW_SSSE3
5184221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com
5185221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com#ifdef HAS_ARGBSEPIAROW_SSSE3
5186221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com//    b = (r * 35 + g * 68 + b * 17) >> 7
5187221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com//    g = (r * 45 + g * 88 + b * 22) >> 7
5188221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com//    r = (r * 50 + g * 98 + b * 24) >> 7
5189c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Constants for ARGB color to sepia tone.
// Each table holds the weights for one output channel from the formulas above,
// stored in the order {b weight, g weight, r weight, a weight} and repeated for
// 4 pixels. The alpha weight is 0, so source alpha does not contribute to the
// computed color channels.
5190851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToSepiaB = {
5191221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
5192221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com};
5193221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com
5194851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToSepiaG = {
5195221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
5196221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com};
5197221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com
5198851a702b39aefb717990a0578d8701d051cbab32fbarchard@google.comstatic const vec8 kARGBToSepiaR = {
5199221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
5200221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com};
5201221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com
5202e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place on dst_argb: each iteration reads the same 32 bytes once per
// channel, computes new B, G and R values as weighted sums with
// kARGBToSepiaB/G/R shifted right by 7 and packed to bytes with unsigned
// saturation, then writes them back together with the original alpha bytes.
// dst_argb is accessed with movdqa and must be 16-byte aligned. width is
// decremented by 8 per iteration; the loop body always runs once and repeats
// while the remainder is > 0.
5203221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com__declspec(naked) __declspec(align(16))
5204221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.comvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
5205221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com  __asm {
5206221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    mov        eax, [esp + 4]   /* dst_argb */
5207221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    mov        ecx, [esp + 8]   /* width */
5208221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm2, kARGBToSepiaB  // weights for output B
5209221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm3, kARGBToSepiaG  // weights for output G
5210221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm4, kARGBToSepiaR  // weights for output R
5211221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com
5212c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5213221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com convertloop:
5214221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm0, [eax]  // B
5215221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm6, [eax + 16]
5216221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    pmaddubsw  xmm0, xmm2  // byte * weight, adjacent pairs summed into words
5217221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    pmaddubsw  xmm6, xmm2
5218221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    phaddw     xmm0, xmm6  // one word per pixel, 8 pixels
5219221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrlw      xmm0, 7
5220221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm0, xmm0   // 8 B values
5221221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm5, [eax]  // G
5222221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm1, [eax + 16]
5223221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    pmaddubsw  xmm5, xmm3
5224221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    pmaddubsw  xmm1, xmm3
5225221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    phaddw     xmm5, xmm1
5226221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrlw      xmm5, 7
5227221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm5, xmm5   // 8 G values
5228221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpcklbw  xmm0, xmm5   // 8 BG values
5229221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm5, [eax]  // R
5230221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm1, [eax + 16]
5231221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    pmaddubsw  xmm5, xmm4
5232221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    pmaddubsw  xmm1, xmm4
5233221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    phaddw     xmm5, xmm1
5234221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrlw      xmm5, 7
5235221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm5, xmm5   // 8 R values
5236221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm6, [eax]  // A
5237221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm1, [eax + 16]
5238221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrld      xmm6, 24  // keep only the alpha byte of each pixel
5239221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    psrld      xmm1, 24
5240221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm6, xmm1
5241221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    packuswb   xmm6, xmm6   // 8 A values
5242221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpcklbw  xmm5, xmm6   // 8 RA values
5243221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     xmm1, xmm0   // Weave BG, RA together
5244221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpcklwd  xmm0, xmm5   // BGRA first 4
5245221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    punpckhwd  xmm1, xmm5   // BGRA next 4
5246221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    sub        ecx, 8  // sets flags for jg; the stores and lea below leave them intact
5247221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     [eax], xmm0  // write back over the source pixels
5248221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    movdqa     [eax + 16], xmm1
5249221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    lea        eax, [eax + 32]
5250221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    jg         convertloop
5251221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com    ret
5252221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com  }
5253221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com}
5254221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com#endif  // HAS_ARGBSEPIAROW_SSSE3
525581b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com
5256e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
5257e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com// Transform 8 ARGB pixels (32 bytes) with color matrix.
5258e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com// Same as Sepia except matrix is provided.
525964ce0ab544591b1e26ae6d276932cacdb8137071fbarchard@google.com// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
5260e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
// matrix_argb supplies 16 signed bytes: four weights for each output channel,
// in the order B, G, R, A. Each group of four is broadcast across a register
// and applied to every pixel. The weighted sum for a channel is formed with
// saturating adds, shifted right arithmetically by 6 and packed to a byte with
// unsigned saturation. Differences from the Sepia row visible here: the shift
// is 6 rather than 7, the output goes to a separate dst_argb, and alpha is
// computed from the matrix instead of being copied.
// src_argb and dst_argb are accessed with movdqa and must be 16-byte aligned;
// matrix_argb is read with an unaligned load. width is decremented by 8 per
// iteration; the loop body always runs once and repeats while the remainder
// is > 0.
5261e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com__declspec(naked) __declspec(align(16))
5262c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.comvoid ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5263c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com                              const int8* matrix_argb, int width) {
5264e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com  __asm {
5265c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
5266c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    mov        edx, [esp + 8]   /* dst_argb */
5267c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    mov        ecx, [esp + 12]  /* matrix_argb */
52684a4b7374c12598560904609c91059003b57bc3d4fbarchard@google.com    movdqu     xmm5, [ecx]  // all 16 matrix bytes
52694a4b7374c12598560904609c91059003b57bc3d4fbarchard@google.com    pshufd     xmm2, xmm5, 0x00  // broadcast dword 0: weights for output B
52704a4b7374c12598560904609c91059003b57bc3d4fbarchard@google.com    pshufd     xmm3, xmm5, 0x55  // broadcast dword 1: weights for output G
52714a4b7374c12598560904609c91059003b57bc3d4fbarchard@google.com    pshufd     xmm4, xmm5, 0xaa  // broadcast dword 2: weights for output R
52724a4b7374c12598560904609c91059003b57bc3d4fbarchard@google.com    pshufd     xmm5, xmm5, 0xff  // broadcast dword 3: weights for output A
5273c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    mov        ecx, [esp + 16]  /* width */
5274e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    // ecx held the matrix pointer above and is the loop counter from here on.
527511a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com    align      4
5276e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com convertloop:
5277e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    movdqa     xmm0, [eax]  // B
5278c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     xmm7, [eax + 16]
5279e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    pmaddubsw  xmm0, xmm2  // unsigned pixel byte * signed weight, pairs summed
5280c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    pmaddubsw  xmm7, xmm2
5281c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     xmm6, [eax]  // G
5282e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    movdqa     xmm1, [eax + 16]
5283c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    pmaddubsw  xmm6, xmm3
5284e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    pmaddubsw  xmm1, xmm3
5285c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    phaddsw    xmm0, xmm7   // B
5286c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    phaddsw    xmm6, xmm1   // G
5287c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    psraw      xmm0, 6      // B
5288c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    psraw      xmm6, 6      // G
52898f439eac1dc6352c214d3797a2af5cee80ead300fbarchard@google.com    packuswb   xmm0, xmm0   // 8 B values
5290c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    packuswb   xmm6, xmm6   // 8 G values
5291c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    punpcklbw  xmm0, xmm6   // 8 BG values
5292c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     xmm1, [eax]  // R
5293c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     xmm7, [eax + 16]
5294e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    pmaddubsw  xmm1, xmm4
5295c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    pmaddubsw  xmm7, xmm4
5296c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    phaddsw    xmm1, xmm7   // R
5297e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    movdqa     xmm6, [eax]  // A
5298c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     xmm7, [eax + 16]
5299c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    pmaddubsw  xmm6, xmm5
5300c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    pmaddubsw  xmm7, xmm5
5301c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    phaddsw    xmm6, xmm7   // A
5302c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    psraw      xmm1, 6      // R
5303c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    psraw      xmm6, 6      // A
5304c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    packuswb   xmm1, xmm1   // 8 R values
5305e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    packuswb   xmm6, xmm6   // 8 A values
5306c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    punpcklbw  xmm1, xmm6   // 8 RA values
5307c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     xmm6, xmm0   // Weave BG, RA together
5308c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    punpcklwd  xmm0, xmm1   // BGRA first 4
5309c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    punpckhwd  xmm6, xmm1   // BGRA next 4
5310e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    sub        ecx, 8  // sets flags for jg; the stores and leas below leave them intact
5311c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     [edx], xmm0
5312c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    movdqa     [edx + 16], xmm6
5313e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    lea        eax, [eax + 32]
5314c99db063e24d6180740d4adc29e84159096eef2dfbarchard@google.com    lea        edx, [edx + 32]
5315e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    jg         convertloop
5316e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com    ret
5317e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com  }
5318e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com}
5319e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
5320e442dc4c2a896e85419628e3b7d97c4dfbe71c9dfbarchard@google.com
532181b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com#ifdef HAS_ARGBQUANTIZEROW_SSE2
532281b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com// Quantize 4 ARGB pixels (16 bytes).
5323c4c578e327a9dbc9dafe113634612e9a349a8c1ffbarchard@google.com// Aligned to 16 bytes.
// Works in place on dst_argb. For each B, G and R byte v the stored result is
//   ((v * scale) >> 16) * interval_size + interval_offset
// computed in 16-bit lanes from the low 16 bits of each argument and packed to
// a byte with unsigned saturation. The alpha lane is instead multiplied and
// offset by the upper 16 bits of each argument (0 when the arguments fit in 16
// bits), and the original alpha byte is then ORed into the result.
// width is decremented by 4 per iteration; the loop body always runs once and
// repeats while the remainder is > 0.
532481b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com__declspec(naked) __declspec(align(16))
532581b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.comvoid ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
532681b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com                          int interval_offset, int width) {
532781b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com  __asm {
532881b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    mov        eax, [esp + 4]    /* dst_argb */
532981b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movd       xmm2, [esp + 8]   /* scale */
533081b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movd       xmm3, [esp + 12]  /* interval_size */
533181b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movd       xmm4, [esp + 16]  /* interval_offset */
533281b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    mov        ecx, [esp + 20]   /* width */
533381b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pshuflw    xmm2, xmm2, 040h  // words 0-2 = low 16 bits, word 3 = high 16 bits
533481b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pshufd     xmm2, xmm2, 044h  // copy the low 4 words to the high 4 words
533581b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pshuflw    xmm3, xmm3, 040h
533681b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pshufd     xmm3, xmm3, 044h
533781b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pshuflw    xmm4, xmm4, 040h
533881b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pshufd     xmm4, xmm4, 044h
533981b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pxor       xmm5, xmm5  // constant 0
534081b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
534181b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pslld      xmm6, 24
534281b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com
5343c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
534481b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com convertloop:
534581b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movdqa     xmm0, [eax]  // read 4 pixels
534681b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    punpcklbw  xmm0, xmm5   // first 2 pixels
534781b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
534881b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movdqa     xmm1, [eax]  // read 4 pixels
534981b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    punpckhbw  xmm1, xmm5   // next 2 pixels
535081b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pmulhuw    xmm1, xmm2
535181b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pmullw     xmm0, xmm3   // * interval_size
535281b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movdqa     xmm7, [eax]  // read 4 pixels
535381b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pmullw     xmm1, xmm3
535481b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    pand       xmm7, xmm6   // mask alpha
535581b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    paddw      xmm0, xmm4   // + interval_offset
535681b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    paddw      xmm1, xmm4
535781b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    packuswb   xmm0, xmm1   // back to 4 pixels of bytes, saturated
535881b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    por        xmm0, xmm7   // merge in the original alpha bytes
535981b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    sub        ecx, 4  // sets flags for jg; the store and lea below leave them intact
536081b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    movdqa     [eax], xmm0
536181b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    lea        eax, [eax + 16]
536281b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    jg         convertloop
536381b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com    ret
536481b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com  }
536581b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com}
536681b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com#endif  // HAS_ARGBQUANTIZEROW_SSE2
536781b804e35c0346ee2fc5f8d11945eab9a88fdb10fbarchard@google.com
5368b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com#ifdef HAS_ARGBSHADEROW_SSE2
5369b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com// Shade 4 pixels at a time by specified value.
5370b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com// Aligned to 16 bytes.
// value is treated as 4 packed bytes, one multiplier per pixel byte in the
// same byte order as the pixels (byte 0 of value scales byte 0 of each pixel).
// Both the pixel byte p and the multiplier byte v are widened by duplicating
// the byte into both halves of a 16-bit word (p * 0x101, v * 0x101); each
// output byte is the top 8 bits of the high 16 bits of that product, i.e.
//   ((p * 0x101) * (v * 0x101)) >> 24
// src_argb and dst_argb are accessed with movdqa and must be 16-byte aligned.
// width is decremented by 4 per iteration; the loop body always runs once and
// repeats while the remainder is > 0.
5371b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com__declspec(naked) __declspec(align(16))
5372b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.comvoid ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
5373b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com                       uint32 value) {
5374b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com  __asm {
5375b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    mov        eax, [esp + 4]   // src_argb
5376b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    mov        edx, [esp + 8]   // dst_argb
5377b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    mov        ecx, [esp + 12]  // width
5378b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    movd       xmm2, [esp + 16]  // value
5379b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    punpcklbw  xmm2, xmm2  // each value byte duplicated into a word
5380b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    punpcklqdq xmm2, xmm2  // same 4 words repeated for the second pixel
5381b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com
5382c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5383b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com convertloop:
5384b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    movdqa     xmm0, [eax]      // read 4 pixels
5385abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        eax, [eax + 16]
5386b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    movdqa     xmm1, xmm0
5387b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    punpcklbw  xmm0, xmm0       // first 2
5388b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    punpckhbw  xmm1, xmm1       // next 2
5389b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    pmulhuw    xmm0, xmm2       // argb * value, high 16 bits
5390b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    pmulhuw    xmm1, xmm2       // argb * value, high 16 bits
5391b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    psrlw      xmm0, 8          // keep the top byte of each word
5392b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    psrlw      xmm1, 8
5393b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    packuswb   xmm0, xmm1
5394b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    sub        ecx, 4  // sets flags for jg; the store and lea below leave them intact
5395abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqa     [edx], xmm0
5396abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 16]
5397b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    jg         convertloop
5398b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com
5399b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com    ret
5400b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com  }
5401b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com}
5402b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com#endif  // HAS_ARGBSHADEROW_SSE2
5403b94b139e86635d40ed0d054bb66e30e6086ae7a3fbarchard@google.com
5404b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com#ifdef HAS_ARGBMULTIPLYROW_SSE2
540583e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Each output byte is (a * 0x0101 * b) >> 16, where a is the byte from
// src_argb0 and b is the byte from src_argb1 (roughly a * b / 255, truncated).
// All loads and stores are unaligned (movdqu).
// The loop body runs before width is tested and always handles 16 bytes, so
// there is no tail handling here.
// NOTE(review): width is presumably a positive multiple of 4 - confirm that
// callers guarantee this.
5406b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com__declspec(naked) __declspec(align(16))
54078fa76349948802d728dd244a7b54051d751d8696fbarchard@google.comvoid ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
54088fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com                          uint8* dst_argb, int width) {
5409b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com  __asm {
    // Naked function: no compiler prologue, so arguments are read straight
    // from the stack.  The extra "4 +" in each offset skips the saved esi.
54108fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com    push       esi
54118fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
54128fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
54138fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
54148fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
5415b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com    pxor       xmm5, xmm5  // constant 0
5416b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com
5417c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5418b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com convertloop:
5419bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
5420abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
5421bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    movdqu     xmm1, xmm0
5422bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    movdqu     xmm3, xmm2
    // src_argb0 bytes are duplicated into both halves of a 16 bit word
    // (a * 0x0101); src_argb1 bytes are zero extended using xmm5.
5423abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    punpcklbw  xmm0, xmm0         // first 2
5424abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    punpckhbw  xmm1, xmm1         // next 2
5425abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    punpcklbw  xmm2, xmm5         // first 2
5426abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    punpckhbw  xmm3, xmm5         // next 2
    // pmulhuw keeps the high 16 bits of each unsigned 16x16 product.
5427abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
5428abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
5429abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        eax, [eax + 16]
5430abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 16]
5431b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com    packuswb   xmm0, xmm1
    // The flags set by this sub are the ones tested by jg below; the movdqu
    // and lea in between do not modify flags.
5432b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com    sub        ecx, 4
5433abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqu     [edx], xmm0
5434abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 16]
5435b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com    jg         convertloop
5436b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com
54378fa76349948802d728dd244a7b54051d751d8696fbarchard@google.com    pop        esi
5438b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com    ret
5439b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com  }
5440b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com}
5441b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com#endif  // HAS_ARGBMULTIPLYROW_SSE2
5442b3c1a3fe796a6caf2a0bc8d0882bc65514d9b74dfbarchard@google.com
544383e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com#ifdef HAS_ARGBADDROW_SSE2
544483e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Each output byte is the unsigned saturating sum (paddusb) of the matching
// bytes of src_argb0 and src_argb1.
// Unlike a plain 4-pixel loop, this version also handles the remainder:
// a 16 byte loop runs while at least 4 pixels are left, then a 4 byte
// (1 pixel) loop finishes the row.  A width of 0 stores nothing.
// All loads and stores are unaligned (movdqu / movd).
5445bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com// TODO(fbarchard): Port this to posix, neon and other math functions.
544683e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com__declspec(naked) __declspec(align(16))
544783e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.comvoid ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
544883e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com                     uint8* dst_argb, int width) {
544983e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "4 +" in each offset skips the saved esi.
545083e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    push       esi
545183e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
545283e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
545383e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
545483e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
545583e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com
    // Bias the counter by -4 so that "ecx >= 0" means a full group of
    // 4 pixels is still available.  Skip the wide loop when width < 4.
5456bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    sub        ecx, 4
5457bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    jl         convertloop49
5458bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com
5459c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5460bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com convertloop4:
5461bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
5462abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        eax, [eax + 16]
5463abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
5464abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 16]
546583e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
    // Flags from this sub are tested by jge below; movdqu and lea leave the
    // flags untouched.
546683e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    sub        ecx, 4
5467abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqu     [edx], xmm0
5468abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 16]
5469bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    jge        convertloop4
5470bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com
5471bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com convertloop49:
    // Undo the -4 bias and apply a -1 bias instead: ecx = remaining - 1.
    // A negative result means no pixels are left.
5472bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    add        ecx, 4 - 1
5473bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    jl         convertloop19
547483e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com
5475bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com convertloop1:
5476bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    movd       xmm0, [eax]        // read 1 pixels from src_argb0
5477abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        eax, [eax + 4]
5478abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movd       xmm1, [esi]        // read 1 pixels from src_argb1
5479abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 4]
5480bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
5481bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    sub        ecx, 1
5482abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movd       [edx], xmm0
5483abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 4]
5484bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    jge        convertloop1
5485bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com
5486bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com convertloop19:
548783e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    pop        esi
548883e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com    ret
548983e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com  }
549083e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com}
549183e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com#endif  // HAS_ARGBADDROW_SSE2
549283e1b17cc0b1840c7b5e361fa19e7263fca2b32bfbarchard@google.com
5493573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com#ifdef HAS_ARGBSUBTRACTROW_SSE2
5494573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
// Each output byte is src_argb0 - src_argb1 with unsigned saturation
// (psubusb), so results that would be negative become 0.
// All loads and stores are unaligned (movdqu).
// The loop body runs before width is tested and always handles 16 bytes, so
// there is no tail handling here.
// NOTE(review): width is presumably a positive multiple of 4 - confirm that
// callers guarantee this.
5495573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com__declspec(naked) __declspec(align(16))
5496573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.comvoid ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5497573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com                          uint8* dst_argb, int width) {
5498573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "4 +" in each offset skips the saved esi.
5499573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    push       esi
5500573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
5501573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
5502573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
5503573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
5504573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com
5505c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5506573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com convertloop:
5507bb92acade0cd17a83ad32177da6a449b2962066efbarchard@google.com    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
5508abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        eax, [eax + 16]
5509abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
5510abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 16]
5511573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
    // Flags from this sub are tested by jg below; movdqu and lea leave the
    // flags untouched.
5512573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    sub        ecx, 4
5513abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    movdqu     [edx], xmm0
5514abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 16]
5515573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    jg         convertloop
5516573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com
5517573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    pop        esi
5518573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com    ret
5519573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com  }
5520573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com}
5521573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com#endif  // HAS_ARGBSUBTRACTROW_SSE2
5522573a883dd65c94a10422e6e9e0d453e2a5d45227fbarchard@google.com
552351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com#ifdef HAS_ARGBMULTIPLYROW_AVX2
552451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Each output byte is (a * 0x0101 * b) >> 16, where a is the byte from
// src_argb0 and b is the byte from src_argb1 (roughly a * b / 255, truncated).
// All loads and stores are unaligned (vmovdqu).
// The loop body runs before width is tested and always handles 32 bytes, so
// there is no tail handling here.
// NOTE(review): width is presumably a positive multiple of 8 - confirm that
// callers guarantee this.
552551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com__declspec(naked) __declspec(align(16))
552651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.comvoid ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
552751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com                          uint8* dst_argb, int width) {
552851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "4 +" in each offset skips the saved esi.
552951d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    push       esi
553051d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
553151d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
553251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
553351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
5534abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vpxor      ymm5, ymm5, ymm5     // constant 0
553551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
5536c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
553751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com convertloop:
553851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
5539abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        eax, [eax + 32]
5540abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
5541abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 32]
    // vpunpcklbw / vpunpckhbw work within each 128 bit lane, so "low 4" and
    // "high 4" below are really the low / high 2 pixels of each lane.
    // vpackuswb is also per lane, so the original pixel order is restored.
    // src_argb0 bytes are duplicated into both halves of a 16 bit word
    // (a * 0x0101); src_argb1 bytes are zero extended using ymm5.
554251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpunpcklbw ymm0, ymm1, ymm1   // low 4
554351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpunpckhbw ymm1, ymm1, ymm1   // high 4
554451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpunpcklbw ymm2, ymm3, ymm5   // low 4
554551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpunpckhbw ymm3, ymm3, ymm5   // high 4
    // vpmulhuw keeps the high 16 bits of each unsigned 16x16 product.
554651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
554751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
554851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1
5549abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vmovdqu    [edx], ymm0
5550abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 32]
5551fc264019de6bf46f284907f966f7947cc51887e5fbarchard@google.com    sub        ecx, 8
555251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    jg         convertloop
555351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
555451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    pop        esi
    // Zero the upper halves of the ymm registers before returning to code
    // that may use legacy SSE instructions.
55559b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
555651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    ret
555751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com  }
555851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com}
555951d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com#endif  // HAS_ARGBMULTIPLYROW_AVX2
556051d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
556151d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com#ifdef HAS_ARGBADDROW_AVX2
556251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Each output byte is the unsigned saturating sum (vpaddusb) of the matching
// bytes of src_argb0 and src_argb1.
// The loop body runs before width is tested and always handles 32 bytes, so
// there is no tail handling here.
// NOTE(review): width is presumably a positive multiple of 8 - confirm that
// callers guarantee this.
556351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com__declspec(naked) __declspec(align(16))
556451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.comvoid ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
556551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com                     uint8* dst_argb, int width) {
556651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "4 +" in each offset skips the saved esi.
556751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    push       esi
556851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
556951d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
557051d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
557151d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
557251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
5573c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
557451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com convertloop:
557551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
557651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    lea        eax, [eax + 32]
    // The second source is used directly as a memory operand.
5577abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
5578abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 32]
5579abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vmovdqu    [edx], ymm0
5580abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 32]
5581fc264019de6bf46f284907f966f7947cc51887e5fbarchard@google.com    sub        ecx, 8
558251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    jg         convertloop
558351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
558451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    pop        esi
    // Zero the upper halves of the ymm registers before returning to code
    // that may use legacy SSE instructions.
55859b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
558651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    ret
558751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com  }
558851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com}
558951d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com#endif  // HAS_ARGBADDROW_AVX2
559051d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
559151d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com#ifdef HAS_ARGBSUBTRACTROW_AVX2
559251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
// Each output byte is src_argb0 - src_argb1 with unsigned saturation
// (vpsubusb), so results that would be negative become 0.
// The loop body runs before width is tested and always handles 32 bytes, so
// there is no tail handling here.
// NOTE(review): width is presumably a positive multiple of 8 - confirm that
// callers guarantee this.
559351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com__declspec(naked) __declspec(align(16))
559451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.comvoid ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
559551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com                          uint8* dst_argb, int width) {
559651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "4 +" in each offset skips the saved esi.
559751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    push       esi
559851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_argb0
559951d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_argb1
560051d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
560151d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
560251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
5603c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
560451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com convertloop:
560551d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
560651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    lea        eax, [eax + 32]
    // The second source is used directly as a memory operand.
5607abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
5608abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        esi, [esi + 32]
5609abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    vmovdqu    [edx], ymm0
5610abfeea9b81084185b0d3abc8fe5b5c4f496a0c18fbarchard@google.com    lea        edx, [edx + 32]
5611fc264019de6bf46f284907f966f7947cc51887e5fbarchard@google.com    sub        ecx, 8
561251d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    jg         convertloop
561351d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
561451d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    pop        esi
    // Zero the upper halves of the ymm registers before returning to code
    // that may use legacy SSE instructions.
56159b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper
561651d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com    ret
561751d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com  }
561851d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com}
561951d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com#endif  // HAS_ARGBSUBTRACTROW_AVX2
562051d3e236cb5923c60ba818b6e825c2658b565afefbarchard@google.com
5621092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com#ifdef HAS_SOBELXROW_SSE2
5622e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com// SobelX as a matrix is
5623e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com// -1  0  1
5624e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com// -2  0  2
5625e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com// -1  0  1
// For each output column i this computes, in 16 bit lanes,
//   (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2])
// which is the negated kernel above; the absolute value is then taken and
// packed to a byte with unsigned saturation, so the sign does not matter.
// 8 output bytes are produced per loop iteration.  Each iteration reads
// 8 bytes starting at offset i + 2 of every source row, i.e. 2 bytes beyond
// the 8 columns being written.
// The loop body runs before width is tested, so there is no tail handling.
// NOTE(review): width is presumably a positive multiple of 8 and the source
// rows readable 2 bytes past it - confirm against callers.
5626e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com__declspec(naked) __declspec(align(16))
5627092099507e44e9f429ec52956a20b28db634b910fbarchard@google.comvoid SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5628092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com                    const uint8* src_y2, uint8* dst_sobelx, int width) {
5629e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "8 +" in each offset skips the saved esi and edi.
5630e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    push       esi
5631e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    push       edi
5632e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        eax, [esp + 8 + 4]   // src_y0
5633e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_y1
5634e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        edi, [esp + 8 + 12]  // src_y2
5635e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        edx, [esp + 8 + 16]  // dst_sobelx
5636e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        ecx, [esp + 8 + 20]  // width
    // Turn the other three pointers into offsets from src_y0, so advancing
    // eax alone steps through all four rows.
5637e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        esi, eax
5638e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        edi, eax
5639e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        edx, eax
5640e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    pxor       xmm5, xmm5  // constant 0
5641e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com
5642c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5643e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com convertloop:
    // Row 0: xmm0 = y0[i] - y0[i + 2], bytes zero extended to 16 bits.
5644e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5645e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5646e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm0, xmm5
5647e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm1, xmm5
5648e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    psubw      xmm0, xmm1
    // Row 1: xmm1 = y1[i] - y1[i + 2].
5649e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5650e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5651e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm1, xmm5
5652e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm2, xmm5
5653e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    psubw      xmm1, xmm2
    // Row 2: xmm2 = y2[i] - y2[i + 2].
5654e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
5655e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
5656e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm2, xmm5
5657e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm3, xmm5
5658e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    psubw      xmm2, xmm3
    // Sum = row0 + row2 + 2 * row1 (row 1 is added twice for its weight of 2).
5659e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    paddw      xmm0, xmm2
5660e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    paddw      xmm0, xmm1
5661e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    paddw      xmm0, xmm1
5662092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5663092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com    psubw      xmm1, xmm0
5664092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com    pmaxsw     xmm0, xmm1
    // Pack to bytes with unsigned saturation; only the low 8 bytes are stored.
5665e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    packuswb   xmm0, xmm0
    // Flags from this sub are tested by jg below; movq and lea leave the
    // flags untouched.
5666e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        ecx, 8
5667e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       qword ptr [eax + edx], xmm0
5668e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    lea        eax, [eax + 8]
5669e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    jg         convertloop
5670e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com
5671e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    pop        edi
5672e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    pop        esi
5673e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    ret
5674e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com  }
5675e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com}
5676092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com#endif  // HAS_SOBELXROW_SSE2
5677e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com
5678092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com#ifdef HAS_SOBELYROW_SSE2
5679e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com// SobelY as a matrix is
5680e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com// -1 -2 -1
5681e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com//  0  0  0
5682e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com//  1  2  1
// Only the two rows with non-zero weights are passed in.  For each output
// column i this computes, in 16 bit lanes,
//   (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) + (y0[i + 2] - y1[i + 2])
// which is the negated kernel above; the absolute value is then taken and
// packed to a byte with unsigned saturation, so the sign does not matter.
// 8 output bytes are produced per loop iteration.  Each iteration reads
// 8 bytes starting at offset i + 2 of both source rows, i.e. 2 bytes beyond
// the 8 columns being written.
// The loop body runs before width is tested, so there is no tail handling.
// NOTE(review): width is presumably a positive multiple of 8 and the source
// rows readable 2 bytes past it - confirm against callers.
5683e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com__declspec(naked) __declspec(align(16))
5684092099507e44e9f429ec52956a20b28db634b910fbarchard@google.comvoid SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5685092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com                    uint8* dst_sobely, int width) {
5686e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com  __asm {
    // Naked function: arguments are read straight from the stack.  The extra
    // "4 +" in each offset skips the saved esi.
5687e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    push       esi
5688e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_y0
5689e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_y1
5690e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_sobely
5691e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
    // Turn src_y1 and dst_sobely into offsets from src_y0, so advancing eax
    // alone steps through all three rows.
5692e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        esi, eax
5693e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        edx, eax
5694e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    pxor       xmm5, xmm5  // constant 0
5695e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com
5696c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5697e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com convertloop:
    // Column 0: xmm0 = y0[i] - y1[i], bytes zero extended to 16 bits.
5698e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
5699e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
5700e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm0, xmm5
5701e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm1, xmm5
5702e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    psubw      xmm0, xmm1
    // Column 1: xmm1 = y0[i + 1] - y1[i + 1].
5703e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
5704e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
5705e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm1, xmm5
5706e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm2, xmm5
5707e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    psubw      xmm1, xmm2
    // Column 2: xmm2 = y0[i + 2] - y1[i + 2].
5708e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
5709e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
5710e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm2, xmm5
5711e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    punpcklbw  xmm3, xmm5
5712e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    psubw      xmm2, xmm3
    // Sum = col0 + col2 + 2 * col1 (column 1 is added twice for its weight
    // of 2).
5713e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    paddw      xmm0, xmm2
5714e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    paddw      xmm0, xmm1
5715e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    paddw      xmm0, xmm1
5716092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
5717092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com    psubw      xmm1, xmm0
5718092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com    pmaxsw     xmm0, xmm1
    // Pack to bytes with unsigned saturation; only the low 8 bytes are stored.
5719e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    packuswb   xmm0, xmm0
    // Flags from this sub are tested by jg below; movq and lea leave the
    // flags untouched.
5720e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    sub        ecx, 8
5721e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    movq       qword ptr [eax + edx], xmm0
5722e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    lea        eax, [eax + 8]
5723e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    jg         convertloop
5724e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com
5725e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    pop        esi
5726e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com    ret
5727e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com  }
5728e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com}
5729092099507e44e9f429ec52956a20b28db634b910fbarchard@google.com#endif  // HAS_SOBELYROW_SSE2
5730e1247eec9498c870a3d24cf9f70472a2fbb9825efbarchard@google.com
5731c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com#ifdef HAS_SOBELROW_SSE2
5732c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com// Adds Sobel X and Sobel Y (unsigned saturating byte add) and stores Sobel into ARGB.
5733c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com// A = 255
5734c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com// R = Sobel
5735c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com// G = Sobel
5736c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com// B = Sobel
5737c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com__declspec(naked) __declspec(align(16))  // naked: arguments are read off the stack by hand below.
5738c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.comvoid SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
57398be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com                   uint8* dst_argb, int width) {  // 16 pixels per loop pass; every row access is movdqa (16 byte aligned).
5740c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com  __asm {
5741c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    push       esi                  // saved here, restored before ret
5742c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_sobelx (the extra 4 skips the pushed esi)
5743c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_sobely
5744c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
5745c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
5746c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    sub        esi, eax             // esi = src_sobely - src_sobelx, so [eax + esi] follows src_sobely
5747c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    pcmpeqb    xmm5, xmm5           // alpha 255
5748c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    pslld      xmm5, 24             // 0xff000000
5749c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com
5750c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5751c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com convertloop:
5752c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
5753c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
5754c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    lea        eax, [eax + 16]
5755c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
5756c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     xmm2, xmm0             // GG
5757c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    punpcklbw  xmm2, xmm0             // First 8
5758c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    punpckhbw  xmm0, xmm0             // Next 8
5759c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     xmm1, xmm2             // GGGG
5760c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    punpcklwd  xmm1, xmm2             // First 4
5761c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    punpckhwd  xmm2, xmm2             // Next 4
5762c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    por        xmm1, xmm5             // GGGA
5763c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    por        xmm2, xmm5             // GGGA
5764c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     xmm3, xmm0             // GGGG
5765c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    punpcklwd  xmm3, xmm0             // Next 4
5766c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    punpckhwd  xmm0, xmm0             // Last 4
5767c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    por        xmm3, xmm5             // GGGA
5768c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    por        xmm0, xmm5             // GGGA
5769c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    sub        ecx, 16                // 16 pixels done; the jg below tests this result
5770c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     [edx], xmm1            // write 16 ARGB pixels (64 bytes)
5771c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     [edx + 16], xmm2
5772c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     [edx + 32], xmm3
5773c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    movdqa     [edx + 48], xmm0
5774c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    lea        edx, [edx + 64]
5775c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    jg         convertloop            // repeat while remaining width > 0
5776c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com
5777c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    pop        esi
5778c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com    ret
5779c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com  }
5780c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com}
5781c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com#endif  // HAS_SOBELROW_SSE2
5782c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com
57838be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com#ifdef HAS_SOBELTOPLANEROW_SSE2
57848be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com// Adds Sobel X and Sobel Y (unsigned saturating byte add) and stores Sobel into a plane.
57858be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com__declspec(naked) __declspec(align(16))  // naked: arguments are read off the stack by hand below.
57868be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.comvoid SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
57878be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com                          uint8* dst_y, int width) {  // 16 pixels per loop pass; every row access is movdqa (16 byte aligned).
57888be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com  __asm {
57898be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    push       esi                  // saved here, restored before ret
57908be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_sobelx (the extra 4 skips the pushed esi)
57918be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_sobely
57928be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_y
57938be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
57948be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    sub        esi, eax             // esi = src_sobely - src_sobelx, so [eax + esi] follows src_sobely
57958be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com
5796c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
57978be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com convertloop:
57988be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
57998be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
58008be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    lea        eax, [eax + 16]
58018be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
58028be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    sub        ecx, 16                // 16 pixels done; the jg below tests this result
58038be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    movdqa     [edx], xmm0            // write 16 bytes of sobel
58048be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    lea        edx, [edx + 16]
58058be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    jg         convertloop            // repeat while remaining width > 0
58068be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com
58078be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    pop        esi
58088be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com    ret
58098be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com  }
58108be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com}
58118be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com#endif  // HAS_SOBELTOPLANEROW_SSE2
58128be4b289c799356d84c68c4eb4b5403285096693fbarchard@google.com
5813610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com#ifdef HAS_SOBELXYROW_SSE2
5814610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com// Mixes Sobel X, Sobel Y and Sobel (X + Y, unsigned saturating) into ARGB.
5815610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com// A = 255
5816610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com// R = Sobel X
5817610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com// G = Sobel
5818610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com// B = Sobel Y
5819610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com__declspec(naked) __declspec(align(16))  // naked: arguments are read off the stack by hand below.
5820610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.comvoid SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5821610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com                     uint8* dst_argb, int width) {  // 16 pixels per loop pass; every row access is movdqa (16 byte aligned).
5822610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com  __asm {
5823610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    push       esi                  // saved here, restored before ret
5824610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    mov        eax, [esp + 4 + 4]   // src_sobelx (the extra 4 skips the pushed esi)
5825610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    mov        esi, [esp + 4 + 8]   // src_sobely
5826610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    mov        edx, [esp + 4 + 12]  // dst_argb
5827610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    mov        ecx, [esp + 4 + 16]  // width
5828610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    sub        esi, eax             // esi = src_sobely - src_sobelx, so [eax + esi] follows src_sobely
5829191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    pcmpeqb    xmm5, xmm5           // alpha 255
5830610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com
5831c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
5832610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com convertloop:
5833610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
5834610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
5835610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    lea        eax, [eax + 16]
5836610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm2, xmm0
5837610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
5838610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm3, xmm0             // XA
5839610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpcklbw  xmm3, xmm5             // First 8: X interleaved with 255
5840610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpckhbw  xmm0, xmm5             // Next 8
5841610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm4, xmm1             // YS
5842610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpcklbw  xmm4, xmm2             // First 8: Y interleaved with sobel
5843610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpckhbw  xmm1, xmm2             // Next 8
5844610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm6, xmm4             // YSXA
5845610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpcklwd  xmm6, xmm3             // First 4
5846610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpckhwd  xmm4, xmm3             // Next 4
5847610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     xmm7, xmm1             // YSXA
5848610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpcklwd  xmm7, xmm0             // Next 4
5849610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    punpckhwd  xmm1, xmm0             // Last 4
5850610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    sub        ecx, 16                // 16 pixels done; the jg below tests this result
5851610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     [edx], xmm6            // write 16 ARGB pixels (64 bytes)
5852610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     [edx + 16], xmm4
5853610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     [edx + 32], xmm7
5854610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    movdqa     [edx + 48], xmm1
5855610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    lea        edx, [edx + 64]
5856610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    jg         convertloop            // repeat while remaining width > 0
5857610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com
5858610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    pop        esi
5859610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com    ret
5860610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com  }
5861610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com}
5862c93a137671e2e281dfb8d32561fed95caacf608bfbarchard@google.com#endif  // HAS_SOBELXYROW_SSE2
5863610e012d56b1cce420369b82335bd178f7e39397fbarchard@google.com
5864f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5865f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// Consider float CumulativeSum.
5866f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// Consider calling CumulativeSum one row at time as needed.
5867f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5868f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// Convert cumulative sum for an area to an average for 1 pixel.
5869f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// topleft is pointer to top left of CumulativeSum buffer for area.
5870f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// botleft is pointer to bottom left of CumulativeSum buffer.
5871f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// width is offset from left to right of area in CumulativeSum buffer measured
5872f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com//   in number of ints.
5873f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// area is the number of pixels in the area being averaged. Areas of 128 or less use a 16 bit fixed point loop, larger areas a float loop.
5874f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// dst points to pixel to store result to (4 bytes are written per pixel, with unaligned stores).
5875f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// count is number of averaged pixels to produce.
5876f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
5877f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// aligned. Leftover pixels (fewer than 4) are done 1 at a time.
5878f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.comvoid CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5879f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com                                    int width, int area, uint8* dst,
5880f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com                                    int count) {
5881f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  __asm {
5882f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        eax, topleft  // eax topleft
5883f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        esi, botleft  // esi botleft
5884f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        edx, width    // edx width in ints; scaled by 4 when used as a byte offset
5885c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    movd       xmm5, area    // area as an integer in the low lane
5886f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        edi, dst      // edi dst
5887f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        ecx, count    // ecx count
5888c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    cvtdq2ps   xmm5, xmm5    // area as float
5889c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    rcpss      xmm4, xmm5  // 1.0f / area (rcpss yields an approximate reciprocal)
5890f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    pshufd     xmm4, xmm4, 0  // broadcast 1 / area to all 4 lanes
5891f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    sub        ecx, 4
5892f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jl         l4b            // fewer than 4 pixels: go straight to the 1 pixel loop
5893f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
5894191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    cmp        area, 128  // 128 pixels will not overflow 15 bits.
5895191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    ja         l4         // larger areas take the float loop
5896191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5897c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    pshufd     xmm5, xmm5, 0        // area
5898c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
5899c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    psrld      xmm6, 16
5900c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    cvtdq2ps   xmm6, xmm6
5901c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    addps      xmm5, xmm6           // (65536.0 + area - 1)
5902c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
5903191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
5904c2a889eb5513143c9207c702429100562b4001f7fbarchard@google.com    packssdw   xmm5, xmm5           // 16 bit shorts
5905191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    // NOTE(review): packssdw saturates at 32767, so the reciprocal for an area of 1 or 2 looks unrepresentable here - confirm callers avoid such areas.
5906191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    // 4 pixel loop small blocks.
5907191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    align      4
5908191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com  s4:
5909191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    // top left
5910191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    movdqa     xmm0, [eax]
5911191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    movdqa     xmm1, [eax + 16]
5912191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    movdqa     xmm2, [eax + 32]
5913191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    movdqa     xmm3, [eax + 48]
5914191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5915191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    // - top right
5916191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm0, [eax + edx * 4]
5917191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm1, [eax + edx * 4 + 16]
5918191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm2, [eax + edx * 4 + 32]
5919191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm3, [eax + edx * 4 + 48]
5920191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    lea        eax, [eax + 64]
5921191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5922191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    // - bottom left
5923191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm0, [esi]
5924191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm1, [esi + 16]
5925191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm2, [esi + 32]
5926191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    psubd      xmm3, [esi + 48]
5927191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5928191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    // + bottom right
5929191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    paddd      xmm0, [esi + edx * 4]
5930191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    paddd      xmm1, [esi + edx * 4 + 16]
5931191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    paddd      xmm2, [esi + edx * 4 + 32]
5932191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    paddd      xmm3, [esi + edx * 4 + 48]
5933191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    lea        esi, [esi + 64]
5934191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5935191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
5936191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    packssdw   xmm2, xmm3
5937191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5938191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    pmulhuw    xmm0, xmm5  // high 16 bits of sum * fixed point reciprocal
5939191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    pmulhuw    xmm2, xmm5
5940191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5941191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    packuswb   xmm0, xmm2  // 16 result bytes = 4 pixels of 4 bytes
5942191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    movdqu     [edi], xmm0
5943191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    lea        edi, [edi + 16]
5944191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    sub        ecx, 4
5945191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    jge        s4
5946191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5947191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com    jmp        l4b         // skip the float loop and finish any leftover pixels
5948191ab180736f78cd25989ec08007874946c85c77fbarchard@google.com
5949f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // 4 pixel loop
5950f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    align      4
5951f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l4:
5952f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // top left
5953f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm0, [eax]
5954f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm1, [eax + 16]
5955f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm2, [eax + 32]
5956f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm3, [eax + 48]
5957f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
5958f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // - top right
5959f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm0, [eax + edx * 4]
5960f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm1, [eax + edx * 4 + 16]
5961f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm2, [eax + edx * 4 + 32]
5962f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm3, [eax + edx * 4 + 48]
5963f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        eax, [eax + 64]
5964f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
5965f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // - bottom left
5966f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm0, [esi]
5967f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm1, [esi + 16]
5968f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm2, [esi + 32]
5969f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm3, [esi + 48]
5970f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
5971f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // + bottom right
5972f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, [esi + edx * 4]
5973f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm1, [esi + edx * 4 + 16]
5974f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm2, [esi + edx * 4 + 32]
5975f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm3, [esi + edx * 4 + 48]
5976f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        esi, [esi + 64]
5977f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
5978f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
5979f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtdq2ps   xmm1, xmm1
5980f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mulps      xmm0, xmm4
5981f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mulps      xmm1, xmm4
5982f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtdq2ps   xmm2, xmm2
5983f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtdq2ps   xmm3, xmm3
5984f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mulps      xmm2, xmm4
5985f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mulps      xmm3, xmm4
5986f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtps2dq   xmm0, xmm0   // back to 32 bit integers
5987f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtps2dq   xmm1, xmm1
5988f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtps2dq   xmm2, xmm2
5989f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtps2dq   xmm3, xmm3
5990f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    packssdw   xmm0, xmm1   // 32 bit to 16 bit
5991f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    packssdw   xmm2, xmm3
5992f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    packuswb   xmm0, xmm2   // 16 result bytes = 4 pixels of 4 bytes
5993f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqu     [edi], xmm0
5994f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        edi, [edi + 16]
5995f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    sub        ecx, 4
5996f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jge        l4
5997f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
5998f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l4b:
5999f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    add        ecx, 4 - 1   // ecx = leftover pixels - 1
6000f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jl         l1b          // nothing left over
6001f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6002f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // 1 pixel loop
6003f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    align      4
6004f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l1:
6005f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm0, [eax]            // top left (1 pixel = 4 ints)
6006f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm0, [eax + edx * 4]  // - top right
6007f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        eax, [eax + 16]
6008f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    psubd      xmm0, [esi]            // - bottom left
6009f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, [esi + edx * 4]  // + bottom right
6010f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        esi, [esi + 16]
6011f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtdq2ps   xmm0, xmm0
6012f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mulps      xmm0, xmm4             // Average = Sum * 1 / Area
6013f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    cvtps2dq   xmm0, xmm0
6014f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    packssdw   xmm0, xmm0
6015f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    packuswb   xmm0, xmm0
6016f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movd       dword ptr [edi], xmm0  // store 1 pixel (4 bytes)
6017f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        edi, [edi + 4]
6018f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    sub        ecx, 1
6019f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jge        l1
6020f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l1b:
6021f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  }
6022f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com}
6023f08ac6bb095348565b5259f2fab95f259ef47edefbarchard@google.com#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
6024f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6025f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
6026f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// Creates a table of cumulative sums where each value is a sum of all values
6027f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com// above and to the left of the value. Each 4 byte pixel of row yields 4 int32 sums in cumsum; previous_cumsum is the row above.
6028f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.comvoid ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
6029133adc46470722b24fdac30d7537d5009e61ef0cfbarchard@google.com                                  const int32* previous_cumsum, int width) {
6030f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  __asm {
6031f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        eax, row
6032f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        edx, cumsum
6033f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        esi, previous_cumsum
6034f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    mov        ecx, width
6035f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    pxor       xmm0, xmm0  // running sum along this row (4 x int32), starts at 0
6036f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    pxor       xmm1, xmm1  // zero, used to widen bytes when unpacking
6037f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6038f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    sub        ecx, 4
6039f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jl         l4b         // fewer than 4 pixels: go straight to the 1 pixel loop
6040f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    test       edx, 15     // the 4 pixel loop stores with movdqa, so cumsum must be 16 byte aligned
6041f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jne        l4b         // unaligned cumsum: every pixel goes through the 1 pixel loop
6042f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // NOTE(review): only cumsum is tested, yet previous_cumsum is read with movdqa below - confirm callers align both alike.
6043f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // 4 pixel loop
6044f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    align      4
6045f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l4:
6046f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
6047f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        eax, [eax + 16]
6048f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm4, xmm2   // copy kept for pixels 2 and 3
6049f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6050f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    punpcklbw  xmm2, xmm1   // pixels 0 and 1 as 16 bit values
6051f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm3, xmm2
6052f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    punpcklwd  xmm2, xmm1   // pixel 0 as 4 x int32
6053f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    punpckhwd  xmm3, xmm1   // pixel 1 as 4 x int32
6054f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6055f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    punpckhbw  xmm4, xmm1   // pixels 2 and 3 as 16 bit values
6056f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     xmm5, xmm4
6057f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    punpcklwd  xmm4, xmm1   // pixel 2 as 4 x int32
6058f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    punpckhwd  xmm5, xmm1   // pixel 3 as 4 x int32
6059f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6060f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, xmm2   // running sum through pixel 0
60619335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    movdqa     xmm2, [esi]  // previous row above.
6062f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm2, xmm0   // cumsum = value above + running sum
6063f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6064f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, xmm3   // running sum through pixel 1
60659335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    movdqa     xmm3, [esi + 16]
6066f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm3, xmm0
6067f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6068f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, xmm4   // running sum through pixel 2
60699335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    movdqa     xmm4, [esi + 32]
6070f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm4, xmm0
6071f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6072f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, xmm5   // running sum through pixel 3
60739335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    movdqa     xmm5, [esi + 48]
60749335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    lea        esi, [esi + 64]
6075f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm5, xmm0
6076f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6077f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     [edx], xmm2  // write 4 pixels of sums (64 bytes)
6078f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     [edx + 16], xmm3
6079f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     [edx + 32], xmm4
6080f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqa     [edx + 48], xmm5
6081f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6082f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        edx, [edx + 64]
6083f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    sub        ecx, 4
6084f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jge        l4
6085f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6086f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l4b:
6087f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    add        ecx, 4 - 1   // ecx = leftover pixels - 1
6088f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jl         l1b          // nothing left over
6089f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6090f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    // 1 pixel loop
6091f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    align      4
6092f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  l1:
6093f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
6094f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        eax, [eax + 4]
6095f38aefef4b66dc8ebe77ff37234be332731d47f6fbarchard@google.com    punpcklbw  xmm2, xmm1             // bytes to 16 bit
6096f38aefef4b66dc8ebe77ff37234be332731d47f6fbarchard@google.com    punpcklwd  xmm2, xmm1             // 16 bit to 4 x int32
6097f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm0, xmm2             // running sum through this pixel
60989335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    movdqu     xmm2, [esi]            // previous row above (unaligned load)
60999335518f4127167ee54b0872ab715c674be06005fbarchard@google.com    lea        esi, [esi + 16]
6100f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    paddd      xmm2, xmm0             // cumsum = value above + running sum
6101f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    movdqu     [edx], xmm2            // unaligned store of 1 pixel of sums
6102f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    lea        edx, [edx + 16]
6103f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    sub        ecx, 1
6104f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com    jge        l1
6105f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6106f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com l1b:
6107f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com  }
6108f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com}
6109f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
6110f51e87912eebc959ac6b9d1ab44978e0e056ca74fbarchard@google.com
6111864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com#ifdef HAS_ARGBAFFINEROW_SSE2
6112864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com// Copy ARGB pixels from source image with slope to a row of destination.
// For each of the `width` destination pixels the (x, y) source coordinate is
// truncated to integer, the 4 byte pixel at src_argb + x * 4 + y * stride is
// copied to dst_argb, and (x, y) is advanced by (dx, dy).
//   uv_dudv - 4 floats: starting x, y followed by the per pixel step dx, dy.
// The offset is built with packssdw + pmaddwd, so x and y are saturated to
// signed 16 bits and only the low 16 bits of src_argb_stride take part in the
// multiply.
6113864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com__declspec(naked) __declspec(align(16))
6114fc7314e86bc7a1a88b38b815e881183521801ea9fbarchard@google.comLIBYUV_API
6115864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.comvoid ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
6116864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com                        uint8* dst_argb, const float* uv_dudv, int width) {
6117864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com  __asm {
    // esi and edi are saved here and restored at l1b.  The two pushes plus
    // the return address put the first argument at esp + 12.
6118864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    push       esi
6119e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    push       edi
6120a0630d77f0433f77eba221854b017d6c8bc5229afbarchard@google.com    mov        eax, [esp + 12]  // src_argb
6121e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    mov        esi, [esp + 16]  // stride
6122e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    mov        edx, [esp + 20]  // dst_argb
6123e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    mov        ecx, [esp + 24]  // pointer to uv_dudv
6124864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    movq       xmm2, qword ptr [ecx]  // uv
6125845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movq       xmm7, qword ptr [ecx + 8]  // dudv
6126e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    mov        ecx, [esp + 28]  // width
    // Build the 16 bit word pair (4, stride) in one dword: stride goes to the
    // high word, 4 (bytes per ARGB pixel) to the low word.  pmaddwd later
    // multiplies it against the (x, y) word pair.
6127864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    shl        esi, 16          // 4, stride
6128864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    add        esi, 4
6129845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movd       xmm5, esi
    // Fewer than 4 pixels: skip the 4 pixel loop.  The 1 pixel loop only reads
    // the low lanes of xmm2, xmm5 and xmm7, so it does not need the setup below.
6130845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    sub        ecx, 4
6131845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    jl         l4b
6132864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com
6133845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    // setup for 4 pixel loop
    // Result: xmm2 = x0, y0, x1, y1.  xmm3 = x2, y2, x3, y3.
    // xmm4 = 4 * dudv (step per iteration).  xmm7 keeps 1 * dudv for the
    // 1 pixel loop.
6134845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pshufd     xmm7, xmm7, 0x44  // dup dudv
6135845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pshufd     xmm5, xmm5, 0  // dup 4, stride
6136864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    movdqa     xmm0, xmm2    // x0, y0 (upper lanes are 0 from movq)
6137845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    addps      xmm0, xmm7    // low lanes = x1, y1
6138864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    movlhps    xmm2, xmm0    // x0, y0, x1, y1
6139845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movdqa     xmm4, xmm7
6140845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    addps      xmm4, xmm4    // dudv *= 2
6141845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movdqa     xmm3, xmm2    // x2, y2, x3, y3
6142845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    addps      xmm3, xmm4
6143845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    addps      xmm4, xmm4    // dudv *= 4
61442d11d43a6e21865b904705acce6535ae4c2d3caffbarchard@google.com
6145845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    // 4 pixel loop
6146864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    align      4
6147845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com  l4:
6148845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
6149845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
6150845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    packssdw   xmm0, xmm1    // x, y as 8 shorts (signed saturation)
6151845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
    // The four byte offsets are peeled off one at a time: movd reads the low
    // dword, pshufd 0x39 rotates the next dword down into the low position.
6152845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movd       esi, xmm0
6153845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pshufd     xmm0, xmm0, 0x39  // shift right
6154e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movd       edi, xmm0
6155845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pshufd     xmm0, xmm0, 0x39  // shift right
6156e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movd       xmm1, [eax + esi]  // read pixel 0
6157e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movd       xmm6, [eax + edi]  // read pixel 1
6158845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
6159e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    addps      xmm2, xmm4    // x, y += dx, dy first 2
6160e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movq       qword ptr [edx], xmm1
6161845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movd       esi, xmm0
6162845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pshufd     xmm0, xmm0, 0x39  // shift right
6163e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movd       edi, xmm0
6164845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movd       xmm6, [eax + esi]  // read pixel 2
6165e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movd       xmm0, [eax + edi]  // read pixel 3
6166845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
6167e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    addps      xmm3, xmm4    // x, y += dx, dy next 2
    // The flags from this sub feed the jge below; the movq and lea in between
    // do not modify flags.
6168845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    sub        ecx, 4
6169e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    movq       qword ptr 8[edx], xmm6
6170845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    lea        edx, [edx + 16]
6171845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    jge        l4
6172864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com
6173845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com  l4b:
    // ecx is (remaining pixels - 4) here; adding 3 gives (remaining - 1),
    // which is negative when nothing is left.
6174845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    add        ecx, 4 - 1
6175864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    jl         l1b
6176864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com
6177864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    // 1 pixel loop
    // Only the low (x, y) pair of xmm2 and the low dword of the pmaddwd
    // result are used.
6178864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    align      4
6179864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com  l1:
6180845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    cvttps2dq  xmm0, xmm2    // x, y float to int
6181845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    packssdw   xmm0, xmm0    // x, y as shorts
6182845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
6183845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    addps      xmm2, xmm7    // x, y += dx, dy
6184845e94d1a74dc4b773159bf91d3c5da23b781476fbarchard@google.com    movd       esi, xmm0
6185864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    movd       xmm0, [eax + esi]  // copy a pixel
6186864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    sub        ecx, 1
6187864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    movd       [edx], xmm0
6188864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    lea        edx, [edx + 4]
6189864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    jge        l1
6190864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com  l1b:
6191e3cc76943e5d1963443ed323c0abd35bafeba945fbarchard@google.com    pop        edi
6192864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    pop        esi
6193864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com    ret
6194864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com  }
6195864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com}
6196864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com#endif  // HAS_ARGBAFFINEROW_SSE2
6197864f828a0167bde25b0d24d6b865aa514919fcc9fbarchard@google.com
61982154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com#ifdef HAS_INTERPOLATEROW_AVX2
61992154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com// Bilinear filter 32x2 -> 32x1
// Blends the row at src_ptr with the row at src_ptr + src_stride into dst_ptr.
// source_y_fraction is halved to a 7 bit weight f for the second row; the
// first row is weighted 128 - f.  f values of 0, 32, 64 and 96 take shortcut
// paths (plain copy or vpavgb); every other value uses vpmaddubsw.
// The blend loops store 32 bytes per iteration and run while the remaining
// count is positive, so a dst_width that is not a multiple of 32 is rounded
// up.  The 100 / 0 path copies exactly dst_width bytes with rep movsb.
// All loads and stores are unaligned (vmovdqu).
62002154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com__declspec(naked) __declspec(align(16))
62012154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.comvoid InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
62022154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com                          ptrdiff_t src_stride, int dst_width,
62032154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com                          int source_y_fraction) {
62042154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  __asm {
    // The "8 +" in the argument offsets accounts for the two pushes.
62052154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    push       esi
62062154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    push       edi
62072154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    mov        edi, [esp + 8 + 4]   // dst_ptr
62082154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_ptr
62092154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    mov        edx, [esp + 8 + 12]  // src_stride
62102154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    mov        ecx, [esp + 8 + 16]  // dst_width
62112154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // Halve to a 7 bit fraction f (0..127 for an input of 0..255).
62122154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    shr        eax, 1
62132154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    // Dispatch to specialized filters if applicable.
62142154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    cmp        eax, 0
62152154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    je         xloop100  // 0 / 128.  Blend 100 / 0.
    // edi becomes dst - src so that [esi + edi] addresses the destination
    // while only esi is advanced.  This is done after the xloop100 test
    // because rep movsb needs the real destination pointer in edi.
62162154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    sub        edi, esi
62172154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    cmp        eax, 32
62182154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
62192154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    cmp        eax, 64
62202154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
62212154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    cmp        eax, 96
62222154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
62232154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
    // General case: build ymm5 = byte pairs (128 - f, f) repeated 16 times.
    // f == 0 was dispatched above, so for a 0..255 input 128 - f is at most
    // 127 and fits the signed byte operand of vpmaddubsw.
    // NOTE(review): inputs outside 0..255 are not range checked here.
62242154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovd      xmm0, eax  // high fraction 0..127
62252154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    neg        eax
62262154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    add        eax, 128
62272154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovd      xmm5, eax  // low fraction 128..1
    // Interleave to the byte pair (128 - f, f), then duplicate the pair to
    // fill the low dword.
62282154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpunpcklbw xmm5, xmm5, xmm0
62292154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpunpcklwd xmm5, xmm5, xmm5
    // vpermd with all-zero indices broadcasts dword 0 to all 8 dwords.
62302154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpxor      ymm0, ymm0, ymm0
62312154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpermd     ymm5, ymm0, ymm5
62322154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
6233c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
62342154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  xloop:
62352154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    ymm0, [esi]
62362154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    ymm2, [esi + edx]
    // Interleave row0 and row1 bytes.  The unpacks work per 128 bit lane, so
    // byte order is permuted ("mutates"); vpackuswb below is also per lane
    // and restores the original order ("unmutates").
62372154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpunpckhbw ymm1, ymm0, ymm2  // mutates
62382154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpunpcklbw ymm0, ymm0, ymm2  // mutates
    // Each 16 bit result = row0 * (128 - f) + row1 * f, then / 128.
62392154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpmaddubsw ymm0, ymm0, ymm5
62402154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpmaddubsw ymm1, ymm1, ymm5
62412154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpsrlw     ymm0, ymm0, 7
62422154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpsrlw     ymm1, ymm1, 7
62432154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpackuswb  ymm0, ymm0, ymm1  // unmutates
    // Flags from this sub feed the jg below; vmovdqu and lea leave them alone.
62442154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    sub        ecx, 32
62452154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    [esi + edi], ymm0
62462154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    lea        esi, [esi + 32]
62472154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jg         xloop
62482154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jmp        xloop99
62492154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
62502154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    // Blend 25 / 75.
    // avg(avg(row0, row1), row1).  vpavgb rounds up at each step.
6251c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
62522154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  xloop25:
62532154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    ymm0, [esi]
62542154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpavgb     ymm0, ymm0, [esi + edx]
62552154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpavgb     ymm0, ymm0, [esi + edx]
62562154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    sub        ecx, 32
62572154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    [esi + edi], ymm0
62582154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    lea        esi, [esi + 32]
62592154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jg         xloop25
62602154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jmp        xloop99
62612154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
62622154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    // Blend 50 / 50.
6263c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
62642154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  xloop50:
62652154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    ymm0, [esi]
62662154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpavgb     ymm0, ymm0, [esi + edx]
62672154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    sub        ecx, 32
62682154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    [esi + edi], ymm0
62692154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    lea        esi, [esi + 32]
62702154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jg         xloop50
62712154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jmp        xloop99
62722154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
62732154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    // Blend 75 / 25.
    // avg(avg(row1, row0), row0).  vpavgb rounds up at each step.
6274c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
62752154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  xloop75:
62762154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu    ymm0, [esi + edx]
62772154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpavgb     ymm0, ymm0, [esi]
62782154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vpavgb     ymm0, ymm0, [esi]
62792154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    sub        ecx, 32
62802154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vmovdqu     [esi + edi], ymm0
62812154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    lea        esi, [esi + 32]
62822154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jg         xloop75
62832154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    jmp        xloop99
62842154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
62852154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    // Blend 100 / 0 - Copy row unchanged.
    // Copies ecx (dst_width) bytes from [esi] to [edi]; edi is still the
    // unmodified dst_ptr on this path.
    // NOTE(review): relies on the direction flag being clear on entry -
    // confirm against the calling convention.
6286c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
62872154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  xloop100:
62882154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    rep movsb
62892154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
62902154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  xloop99:
62912154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    pop        edi
62922154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    pop        esi
    // Zero the upper halves of the ymm registers before returning.
62932154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    vzeroupper
62942154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com    ret
62952154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com  }
62962154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com}
62972154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com#endif  // HAS_INTERPOLATEROW_AVX2
62982154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com
62992154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com#ifdef HAS_INTERPOLATEROW_SSSE3
6300b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com// Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr with the row at src_ptr + src_stride into dst_ptr.
// source_y_fraction is halved to a 7 bit weight f for the second row; the
// first row is weighted 128 - f.  f values of 0, 32, 64 and 96 take shortcut
// paths (plain copy or pavgb); every other value uses pmaddubsw.
// Every path stores 16 bytes per iteration and runs while the remaining count
// is positive, so a dst_width that is not a multiple of 16 is rounded up.
// All loads and stores use movdqa, so src_ptr, src_ptr + src_stride and
// dst_ptr must be 16 byte aligned.
63019bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com__declspec(naked) __declspec(align(16))
6302b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.comvoid InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6303b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                          ptrdiff_t src_stride, int dst_width,
6304b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                          int source_y_fraction) {
63059bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com  __asm {
    // The "8 +" in the argument offsets accounts for the two pushes.
63069bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    push       esi
63079bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    push       edi
6308b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        edi, [esp + 8 + 4]   // dst_ptr
6309b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_ptr
63109bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    mov        edx, [esp + 8 + 12]  // src_stride
63119bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    mov        ecx, [esp + 8 + 16]  // dst_width
63129bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // edi becomes dst - src so that [esi + edi] addresses the destination
    // while only esi is advanced.
63139bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    sub        edi, esi
    // Halve to a 7 bit fraction f (0..127 for an input of 0..255).
63149bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    shr        eax, 1
63158c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    // Dispatch to specialized filters if applicable.
63168c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    cmp        eax, 0
63178c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop100  // 0 / 128.  Blend 100 / 0.
6318b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    cmp        eax, 32
63198c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
63209bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    cmp        eax, 64
63218c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
6322b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    cmp        eax, 96
63238c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
6324b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com
    // General case: build xmm5 = byte pairs (128 - f, f) repeated 8 times.
    // f == 0 was dispatched above, so for a 0..255 input 128 - f is at most
    // 127 and fits the signed byte operand of pmaddubsw.
    // NOTE(review): inputs outside 0..255 are not range checked here.
63259bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movd       xmm0, eax  // high fraction 0..127
63269bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    neg        eax
63279bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    add        eax, 128
63289bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movd       xmm5, eax  // low fraction 128..1
    // Interleave to the byte pair (128 - f, f), duplicate the pair to fill
    // the low dword, then broadcast that dword to all four.
63299bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    punpcklbw  xmm5, xmm0
63309bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    punpcklwd  xmm5, xmm5
63319bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    pshufd     xmm5, xmm5, 0
63329bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
6333c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
63349bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com  xloop:
63359bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     xmm0, [esi]
63369bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     xmm2, [esi + edx]
63379bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     xmm1, xmm0
    // Interleave row0 and row1 bytes so each word holds (row0, row1).
63389bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    punpcklbw  xmm0, xmm2
63399bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    punpckhbw  xmm1, xmm2
    // Each 16 bit result = row0 * (128 - f) + row1 * f, then / 128.
63409bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    pmaddubsw  xmm0, xmm5
63419bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    pmaddubsw  xmm1, xmm5
63429bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    psrlw      xmm0, 7
63439bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    psrlw      xmm1, 7
63449bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    packuswb   xmm0, xmm1
    // Flags from this sub feed the jg below; movdqa and lea leave them alone.
6345b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
63469bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     [esi + edi], xmm0
63479bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    lea        esi, [esi + 16]
63489bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    jg         xloop
6349b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jmp        xloop99
63509bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
6351b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    // Blend 25 / 75.
    // avg(avg(row0, row1), row1).  pavgb rounds up at each step.
6352c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6353b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com  xloop25:
6354b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     xmm0, [esi]
6355b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     xmm1, [esi + edx]
6356b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    pavgb      xmm0, xmm1
6357b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    pavgb      xmm0, xmm1
6358b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6359b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     [esi + edi], xmm0
6360b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    lea        esi, [esi + 16]
6361b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jg         xloop25
6362b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jmp        xloop99
63639bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
6364b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    // Blend 50 / 50.
6365c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6366b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com  xloop50:
63679bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     xmm0, [esi]
6368b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     xmm1, [esi + edx]
6369b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    pavgb      xmm0, xmm1
6370b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
63719bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     [esi + edi], xmm0
63729bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    lea        esi, [esi + 16]
6373b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jg         xloop50
6374b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jmp        xloop99
63759bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
6376b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    // Blend 75 / 25.
    // avg(avg(row1, row0), row0).  pavgb rounds up at each step.
6377c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6378b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com  xloop75:
6379b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     xmm1, [esi]
6380b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     xmm0, [esi + edx]
6381b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    pavgb      xmm0, xmm1
6382b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    pavgb      xmm0, xmm1
6383b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6384b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    movdqa     [esi + edi], xmm0
6385b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    lea        esi, [esi + 16]
6386b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jg         xloop75
6387b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jmp        xloop99
63889bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
6389b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    // Blend 100 / 0 - Copy row unchanged.
    // Falls through into xloop99 when the count runs out.
6390c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6391b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com  xloop100:
63929bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     xmm0, [esi]
6393b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
63949bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    movdqa     [esi + edi], xmm0
63959bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    lea        esi, [esi + 16]
6396b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com    jg         xloop100
63979bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
63988811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  xloop99:
63998811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pop        edi
64008811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pop        esi
64018811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    ret
64028811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  }
64038811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com}
64042154de414c1ac41fb434348964c728ccc077ffd3fbarchard@google.com#endif  // HAS_INTERPOLATEROW_SSSE3
64058811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
640697c96261076adb3294105db38b461bcfae9597d3fbarchard@google.com#ifdef HAS_INTERPOLATEROW_SSE2
6407b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com// Bilinear filter 16x2 -> 16x1
64088811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com__declspec(naked) __declspec(align(16))
6409b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.comvoid InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6410b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                         ptrdiff_t src_stride, int dst_width,
6411b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                         int source_y_fraction) {
64128811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  __asm {
64138811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    push       esi
64148811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    push       edi
6415b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        edi, [esp + 8 + 4]   // dst_ptr
6416b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_ptr
64178811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    mov        edx, [esp + 8 + 12]  // src_stride
64188811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    mov        ecx, [esp + 8 + 16]  // dst_width
64198811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
64208811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    sub        edi, esi
64218c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    // Dispatch to specialized filters if applicable.
64228c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    cmp        eax, 0
64238c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop100  // 0 / 256.  Blend 100 / 0.
64248811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    cmp        eax, 64
64258c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
64268811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    cmp        eax, 128
64278c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
64288811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    cmp        eax, 192
64298c4e5e284c320cd9b9bf137ba1bd10a88b398b48fbarchard@google.com    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
64308811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
64318811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movd       xmm5, eax            // xmm5 = y fraction
64328811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpcklbw  xmm5, xmm5
64338811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    psrlw      xmm5, 1
64348811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpcklwd  xmm5, xmm5
64358811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpckldq  xmm5, xmm5
64368811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpcklqdq xmm5, xmm5
64378811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pxor       xmm4, xmm4
64388811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
6439c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
64408811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  xloop:
64418811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm0, [esi]  // row0
64428811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm2, [esi + edx]  // row1
64438811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm1, xmm0
64448811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm3, xmm2
64458811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpcklbw  xmm2, xmm4
64468811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpckhbw  xmm3, xmm4
64478811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpcklbw  xmm0, xmm4
64488811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    punpckhbw  xmm1, xmm4
64498811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    psubw      xmm2, xmm0  // row1 - row0
64508811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    psubw      xmm3, xmm1
64518811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
64528811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    paddw      xmm3, xmm3
64538811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pmulhw     xmm2, xmm5  // scale diff
64548811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pmulhw     xmm3, xmm5
64558811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    paddw      xmm0, xmm2  // sum rows
64568811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    paddw      xmm1, xmm3
64578811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    packuswb   xmm0, xmm1
6458b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
64598811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     [esi + edi], xmm0
64608811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    lea        esi, [esi + 16]
64618811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jg         xloop
64628811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jmp        xloop99
64638811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
64648811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    // Blend 25 / 75.
6465c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
64668811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  xloop25:
64678811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm0, [esi]
64688811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm1, [esi + edx]
64698811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pavgb      xmm0, xmm1
64708811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pavgb      xmm0, xmm1
6471b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
64728811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     [esi + edi], xmm0
64738811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    lea        esi, [esi + 16]
64748811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jg         xloop25
64758811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jmp        xloop99
64768811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
64778811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    // Blend 50 / 50.
6478c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
64798811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  xloop50:
64808811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm0, [esi]
64818811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm1, [esi + edx]
64828811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pavgb      xmm0, xmm1
6483b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
64848811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     [esi + edi], xmm0
64858811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    lea        esi, [esi + 16]
64868811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jg         xloop50
64878811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jmp        xloop99
64888811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
64898811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    // Blend 75 / 25.
6490c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
64918811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  xloop75:
64928811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm1, [esi]
64938811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm0, [esi + edx]
64948811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pavgb      xmm0, xmm1
64958811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    pavgb      xmm0, xmm1
6496b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
64978811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     [esi + edi], xmm0
64988811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    lea        esi, [esi + 16]
64998811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jg         xloop75
65008811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jmp        xloop99
65018811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
65028811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    // Blend 100 / 0 - Copy row unchanged.
6503c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
65048811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com  xloop100:
65058811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     xmm0, [esi]
6506b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
65078811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    movdqa     [esi + edi], xmm0
65088811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    lea        esi, [esi + 16]
65098811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com    jg         xloop100
65108811289be7ad257f16a6532aeec2cb5db331fa54fbarchard@google.com
6511b5491759b45de37df781d4408a0c46abf6d4ae08fbarchard@google.com  xloop99:
65129bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    pop        edi
65139bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    pop        esi
65149bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com    ret
65159bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com  }
65169bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com}
651797c96261076adb3294105db38b461bcfae9597d3fbarchard@google.com#endif  // HAS_INTERPOLATEROW_SSE2
65189bcc9a25355841f844e9fae3ba40522447312a66fbarchard@google.com
6519b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com// Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr (row0) with the row at src_ptr + src_stride
// (row1) and writes the result to dst_ptr, 16 bytes per loop iteration,
// using unaligned loads and stores (movdqu).
// source_y_fraction is the weight of row1. It is halved to a 7 bit value f
// and the general path computes (row0 * (128 - f) + row1 * f) >> 7 with
// pmaddubsw. Halved fractions of 0, 32, 64 and 96 are dispatched to cheaper
// copy / pavgb loops.
// Every loop executes at least once and always stores a full 16 bytes, so
// dst_width is presumably expected to be a positive multiple of 16 (or the
// buffers padded accordingly) - TODO confirm against callers.
6520cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com__declspec(naked) __declspec(align(16))
6521b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.comvoid InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6522b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                                    ptrdiff_t src_stride, int dst_width,
6523b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                                    int source_y_fraction) {
6524cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  __asm {
6525cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    push       esi  // saved here, restored at xloop99
6526cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    push       edi  // the two pushes are the "+ 8" in the offsets below
6527b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        edi, [esp + 8 + 4]   // dst_ptr
6528b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_ptr
6529cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    mov        edx, [esp + 8 + 12]  // src_stride
6530cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    mov        ecx, [esp + 8 + 16]  // dst_width
6531cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
6532cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    sub        edi, esi  // edi = dst - src, so [esi + edi] addresses dst
6533cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    shr        eax, 1  // halve the fraction to 7 bits
6534cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Dispatch to specialized filters if applicable.
6535cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 0
6536cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop100  // 0 / 128.  Blend 100 / 0.
6537cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 32
6538cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
6539cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 64
6540cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
6541cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 96
6542cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
6543cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6544cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movd       xmm0, eax  // high fraction 0..127
6545cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    neg        eax
6546cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    add        eax, 128  // eax = 128 - f
6547cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movd       xmm5, eax  // low fraction 128..1
6548cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklbw  xmm5, xmm0  // byte pair (128 - f, f); f == 0 was dispatched above, so for a 0..255 fraction both weights fit pmaddubsw's signed bytes
6549cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklwd  xmm5, xmm5  // duplicate the pair into the low dword
6550cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pshufd     xmm5, xmm5, 0  // broadcast the low dword to the whole register
6551cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6552c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6553cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop:
6554cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]  // row0
6555cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm2, [esi + edx]  // row1
6556cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, xmm0
6557cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklbw  xmm0, xmm2  // interleave row0 / row1 bytes, low 8 pixels
6558cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpckhbw  xmm1, xmm2  // interleave row0 / row1 bytes, high 8 pixels
6559cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pmaddubsw  xmm0, xmm5  // row0 * (128 - f) + row1 * f as 16 bit words
6560cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pmaddubsw  xmm1, xmm5
6561cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    psrlw      xmm0, 7  // divide by 128
6562cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    psrlw      xmm1, 7
6563cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    packuswb   xmm0, xmm1  // back to 16 bytes
6564b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16  // sets the flags tested by jg below
6565cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6566cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]  // lea advances src without touching flags
6567cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop
6568cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6569cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6570cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 25 / 75.
6571c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6572cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop25:
6573cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]  // row0
6574cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, [esi + edx]  // row1
6575cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg(row0, row1)
6576cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg of that with row1 again: row1 gets ~75%
6577b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6578cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6579cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6580cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop25
6581cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6582cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6583cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 50 / 50.
6584c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6585cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop50:
6586cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]  // row0
6587cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, [esi + edx]  // row1
6588cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // single byte-wise average of the two rows
6589b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6590cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6591cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6592cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop50
6593cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6594cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6595cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 75 / 25.
6596c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6597cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop75:
6598cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, [esi]  // row0 (registers swapped relative to xloop25)
6599cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi + edx]  // row1
6600cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg(row1, row0)
6601cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg of that with row0 again: row0 gets ~75%
6602b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6603cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6604cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6605cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop75
6606cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6607cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6608cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 100 / 0 - Copy row unchanged.
6609c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6610cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop100:
6611cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]
6612b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6613cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6614cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6615cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop100  // falls through to xloop99 when done
6616cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6617cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop99:
6618cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pop        edi
6619cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pop        esi
6620cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    ret
6621cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  }
6622cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com}
6623cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
662497c96261076adb3294105db38b461bcfae9597d3fbarchard@google.com#ifdef HAS_INTERPOLATEROW_SSE2
6625b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com// Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr (row0) with the row at src_ptr + src_stride
// (row1) and writes the result to dst_ptr, 16 bytes per loop iteration,
// using unaligned loads and stores (movdqu).
// source_y_fraction is the weight of row1 out of 256. The general path
// widens both rows to 16 bit words and computes
// row0 + (((row1 - row0) * 2 * scale) >> 16), where scale is the 15 bit
// value built from the fraction below. Fractions of 0, 64, 128 and 192 are
// dispatched to cheaper copy / pavgb loops.
// Every loop executes at least once and always stores a full 16 bytes, so
// dst_width is presumably expected to be a positive multiple of 16 (or the
// buffers padded accordingly) - TODO confirm against callers.
6626cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com__declspec(naked) __declspec(align(16))
6627b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.comvoid InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6628b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                                   ptrdiff_t src_stride, int dst_width,
6629b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com                                   int source_y_fraction) {
6630cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  __asm {
6631cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    push       esi  // saved here, restored at xloop99
6632cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    push       edi  // the two pushes are the "+ 8" in the offsets below
6633b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        edi, [esp + 8 + 4]   // dst_ptr
6634b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    mov        esi, [esp + 8 + 8]   // src_ptr
6635cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    mov        edx, [esp + 8 + 12]  // src_stride
6636cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    mov        ecx, [esp + 8 + 16]  // dst_width
6637cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
6638cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    sub        edi, esi  // edi = dst - src, so [esi + edi] addresses dst
6639cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Dispatch to specialized filters if applicable.
6640cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 0
6641cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop100  // 0 / 256.  Blend 100 / 0.
6642cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 64
6643cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
6644cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 128
6645cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
6646cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    cmp        eax, 192
6647cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
6648cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6649cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movd       xmm5, eax            // xmm5 = y fraction
6650cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklbw  xmm5, xmm5  // low word = fraction in both bytes (fraction * 257 for 0..255)
6651cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    psrlw      xmm5, 1  // drop to 15 bits so the word stays positive for pmulhw
6652cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklwd  xmm5, xmm5  // these three unpacks broadcast the low word
6653cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpckldq  xmm5, xmm5
6654cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklqdq xmm5, xmm5  // xmm5 = scale in all 8 words
6655cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pxor       xmm4, xmm4  // zero register used to widen bytes to words
6656cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6657c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6658cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop:
6659cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]  // row0
6660cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm2, [esi + edx]  // row1
6661cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, xmm0
6662cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm3, xmm2
6663cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklbw  xmm2, xmm4  // row1 low 8 pixels as words
6664cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpckhbw  xmm3, xmm4  // row1 high 8 pixels as words
6665cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpcklbw  xmm0, xmm4  // row0 low 8 pixels as words
6666cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    punpckhbw  xmm1, xmm4  // row0 high 8 pixels as words
6667cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    psubw      xmm2, xmm0  // row1 - row0
6668cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    psubw      xmm3, xmm1
6669cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
6670cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    paddw      xmm3, xmm3
6671cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pmulhw     xmm2, xmm5  // scale diff
6672cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pmulhw     xmm3, xmm5
6673cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    paddw      xmm0, xmm2  // sum rows
6674cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    paddw      xmm1, xmm3
6675cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    packuswb   xmm0, xmm1  // back to 16 bytes with unsigned saturation
6676b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16  // sets the flags tested by jg below
6677cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6678cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]  // lea advances src without touching flags
6679cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop
6680cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6681cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6682cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 25 / 75.
6683c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6684cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop25:
6685cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]  // row0
6686cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, [esi + edx]  // row1
6687cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg(row0, row1)
6688cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg of that with row1 again: row1 gets ~75%
6689b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6690cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6691cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6692cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop25
6693cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6694cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6695cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 50 / 50.
6696c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6697cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop50:
6698cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]  // row0
6699cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, [esi + edx]  // row1
6700cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // single byte-wise average of the two rows
6701b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6702cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6703cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6704cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop50
6705cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6706cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6707cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 75 / 25.
6708c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6709cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop75:
6710cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm1, [esi]  // row0 (registers swapped relative to xloop25)
6711cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi + edx]  // row1
6712cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg(row1, row0)
6713cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pavgb      xmm0, xmm1  // avg of that with row0 again: row0 gets ~75%
6714b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6715cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6716cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6717cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop75
6718cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jmp        xloop99
6719cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6720cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    // Blend 100 / 0 - Copy row unchanged.
6721c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6722cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop100:
6723cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     xmm0, [esi]
6724b911428afd3994f47e5780a80c876d05d1d4c590fbarchard@google.com    sub        ecx, 16
6725cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    movdqu     [esi + edi], xmm0
6726cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    lea        esi, [esi + 16]
6727cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    jg         xloop100  // falls through to xloop99 when done
6728cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
6729cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  xloop99:
6730cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pop        edi
6731cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    pop        esi
6732cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com    ret
6733cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com  }
6734cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com}
673597c96261076adb3294105db38b461bcfae9597d3fbarchard@google.com#endif  // HAS_INTERPOLATEROW_SSE2
6736cd6056c01cec8de0431390933537c9b8458bd472fbarchard@google.com
// Averages the row at src_uv with the row at src_uv + src_uv_stride using
// pavgb and writes the result to dst_uv, 16 bytes per loop iteration.
// movdqa and the memory operand of pavgb require 16 byte aligned addresses,
// so both source rows and dst_uv must be 16 byte aligned.
// The loop executes at least once and always stores a full 16 bytes, so pix
// is presumably expected to be a positive multiple of 16 - TODO confirm
// against callers.
6737e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com__declspec(naked) __declspec(align(16))
6738e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.comvoid HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
6739e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com                  uint8* dst_uv, int pix) {
6740e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com  __asm {
6741e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    push       edi  // the push is the leading "4 +" in the offsets below
6742e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uv
6743e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    mov        edx, [esp + 4 + 8]    // src_uv_stride
6744e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_uv
6745e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
6746e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    sub        edi, eax  // edi = dst - src, so [eax + edi] addresses dst
6747e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com
6748c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6749e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com  convertloop:
6750e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    movdqa     xmm0, [eax]  // first row
6751e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    pavgb      xmm0, [eax + edx]  // average with the row one stride away
6752e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    sub        ecx, 16  // sets the flags tested by jg below
6753e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    movdqa     [eax + edi], xmm0
6754e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    lea        eax,  [eax + 16]  // lea advances src without touching flags
6755e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    jg         convertloop
6756e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    pop        edi
6757e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com    ret
6758e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com  }
6759e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com}
6760e91bdaca3674830570cbb2aaab6d5c939f56dee4fbarchard@google.com
6761e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com#ifdef HAS_HALFROW_AVX2
// AVX2 variant of the row averaging above: averages the row at src_uv with
// the row at src_uv + src_uv_stride using vpavgb and writes the result to
// dst_uv, 32 bytes per loop iteration, with unaligned loads and stores
// (vmovdqu).
// The loop executes at least once and always stores a full 32 bytes, so pix
// is presumably expected to be a positive multiple of 32 - TODO confirm
// against callers.
6762e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com__declspec(naked) __declspec(align(16))
6763e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.comvoid HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
6764e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com                  uint8* dst_uv, int pix) {
6765e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com  __asm {
6766e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    push       edi  // the push is the leading "4 +" in the offsets below
6767e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_uv
6768e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    mov        edx, [esp + 4 + 8]    // src_uv_stride
6769e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    mov        edi, [esp + 4 + 12]   // dst_uv
6770e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // pix
6771e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    sub        edi, eax  // edi = dst - src, so [eax + edi] addresses dst
6772e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com
6773c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6774e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com  convertloop:
6775e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    vmovdqu    ymm0, [eax]  // first row
6776e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    vpavgb     ymm0, ymm0, [eax + edx]  // average with the row one stride away
6777e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    sub        ecx, 32  // sets the flags tested by jg below
6778e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    vmovdqu    [eax + edi], ymm0
6779e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    lea        eax,  [eax + 32]  // lea advances src without touching flags
6780e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    jg         convertloop
67819b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com
6782e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    pop        edi
67839b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper  // zero the upper halves of the ymm registers before returning
6784e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com    ret
6785e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com  }
6786e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com}
6787e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com#endif  // HAS_HALFROW_AVX2
6788e1bb5d94302dfa31c305bd8d0e3083a70cac5d77fbarchard@google.com
// Picks one byte from each 4 byte pixel of src_argb and writes it to
// dst_bayer: each loop iteration reads 32 source bytes (8 pixels) and stores
// 8 output bytes.
// selector is a 32 bit value used as a pshufb control: its 4 bytes are the
// source byte indices (within each 16 byte load) of the bytes to keep.
// movdqa requires src_argb to be 16 byte aligned; dst_bayer is written with
// movq, which has no alignment requirement.
// The loop executes at least once and always handles a full 8 pixels, so pix
// is presumably expected to be a positive multiple of 8 - TODO confirm
// against callers.
67898d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com__declspec(naked) __declspec(align(16))
67901096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.comvoid ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
67911096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com                          uint32 selector, int pix) {
67928d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com  __asm {
67938d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    mov        eax, [esp + 4]    // src_argb
67948d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    mov        edx, [esp + 8]    // dst_bayer
67958d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    movd       xmm5, [esp + 12]  // selector
67968d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    mov        ecx, [esp + 16]   // pix
67978d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    pshufd     xmm5, xmm5, 0  // replicate the selector into all 4 dwords
67988d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com
6799c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
68008d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com  wloop:
68018d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    movdqa     xmm0, [eax]  // pixels 0..3
6802e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    movdqa     xmm1, [eax + 16]  // pixels 4..7
6803e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    lea        eax, [eax + 32]
68048d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    pshufb     xmm0, xmm5  // selected bytes land in every dword
6805e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    pshufb     xmm1, xmm5
6806e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    punpckldq  xmm0, xmm1  // low 8 bytes = 4 from xmm0 then 4 from xmm1
6807e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    sub        ecx, 8  // sets the flags tested by jg below
6808e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    movq       qword ptr [edx], xmm0
6809e8df16bd7c44e58ea925c51ea82a34144ada3956fbarchard@google.com    lea        edx, [edx + 8]  // lea advances dst without touching flags
68108d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    jg         wloop
68118d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com    ret
68128d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com  }
68138d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com}
68148d37dd5c205216e0ad13c5091061908cb981c5f9fbarchard@google.com
681508b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com// Specialized ARGB to Bayer that just isolates G channel.
// Extracts byte 1 of every 4-byte pixel (shift right by 8, mask to 0xff) and
// writes the results as consecutive bytes; 8 pixels (32 source bytes) are
// consumed and 8 bytes stored per loop pass.
//   src_argb  - source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_bayer - destination; written 8 bytes at a time with movq.
//   selector  - never read by this routine (its stack slot is not loaded).
//   pix       - counter decremented by 8 per pass; the loop body always runs
//               at least once and repeats while the remainder is > 0.
681608b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com__declspec(naked) __declspec(align(16))
681708b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.comvoid ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
681808b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com                           uint32 selector, int pix) {
681908b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com  __asm {
682008b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    mov        eax, [esp + 4]    // src_argb
682108b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    mov        edx, [esp + 8]    // dst_bayer
682208b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com                                 // selector
682308b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    mov        ecx, [esp + 16]   // pix
682408b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
682508b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    psrld      xmm5, 24
682608b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com
6827c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
682808b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com  wloop:
682908b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    movdqa     xmm0, [eax]       // pixels 0-3
683008b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    movdqa     xmm1, [eax + 16]  // pixels 4-7
683108b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    lea        eax, [eax + 32]
683208b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    psrld      xmm0, 8  // Move green to bottom.
683308b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    psrld      xmm1, 8
683408b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    pand       xmm0, xmm5        // keep only the low byte of each dword
683508b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    pand       xmm1, xmm5
68364c736098d6c10a9b4f407b5350f8a0ba5848c22dfbarchard@google.com    packssdw   xmm0, xmm1        // 8 dwords -> 8 words (values are 0..255, no saturation)
683708b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    packuswb   xmm0, xmm1        // low 8 bytes = the 8 results; high 8 bytes are not stored
683808b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    sub        ecx, 8            // sets flags for jg; movq/lea below leave them intact
683908b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    movq       qword ptr [edx], xmm0
684008b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    lea        edx, [edx + 8]
684108b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    jg         wloop
684208b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com    ret
684308b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com  }
684408b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com}
684508b24a4232600b2f9f21584f34f6868d8c15c215fbarchard@google.com
68461096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Applies a 16-byte pshufb mask to the source, 32 bytes (8 pixels) per loop
// pass.
//   src_argb - source; read with movdqa, so must be 16-byte aligned.
//   dst_argb - destination; written with movdqa, so must be 16-byte aligned.
//   shuffler - 16-byte pshufb control mask; loaded with movdqa, so it must be
//              16-byte aligned as well.
//   pix      - counter decremented by 8 per pass; the loop body always runs
//              at least once and repeats while the remainder is > 0.
68471096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com__declspec(naked) __declspec(align(16))
68481096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.comvoid ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
68491096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com                          const uint8* shuffler, int pix) {
68501096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  __asm {
68511096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        eax, [esp + 4]    // src_argb
6852212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        edx, [esp + 8]    // dst_argb
68531096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        ecx, [esp + 12]   // shuffler
68541096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqa     xmm5, [ecx]       // mask stays in xmm5 for the whole row
68551096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        ecx, [esp + 16]   // pix
68561096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com
6857c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
68581096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  wloop:
68591096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqa     xmm0, [eax]
68601096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqa     xmm1, [eax + 16]
68611096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    lea        eax, [eax + 32]
68621096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    pshufb     xmm0, xmm5
68631096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    pshufb     xmm1, xmm5
68641096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    sub        ecx, 8            // sets flags for jg; the stores/lea below leave them intact
68651096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqa     [edx], xmm0
68661096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqa     [edx + 16], xmm1
68671096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    lea        edx, [edx + 32]
68681096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    jg         wloop
68691096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    ret
68701096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  }
68711096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com}
68721096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com
// Applies a 16-byte pshufb mask to the source, 32 bytes (8 pixels) per loop
// pass, using only unaligned loads and stores.
//   src_argb - source; read with movdqu, no alignment requirement.
//   dst_argb - destination; written with movdqu, no alignment requirement.
//   shuffler - 16-byte pshufb control mask; loaded with movdqu so that it,
//              too, may live at any address (movdqa would fault on a mask
//              that is not 16-byte aligned).
//   pix      - counter decremented by 8 per pass; the loop body always runs
//              at least once and repeats while the remainder is > 0.
68731096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com__declspec(naked) __declspec(align(16))
68741096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.comvoid ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
68751096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com                                    const uint8* shuffler, int pix) {
68761096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  __asm {
68771096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        eax, [esp + 4]    // src_argb
6878212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        edx, [esp + 8]    // dst_argb
68791096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        ecx, [esp + 12]   // shuffler
68801096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqu     xmm5, [ecx]       // unaligned load: mask need not be 16-byte aligned
68811096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        ecx, [esp + 16]   // pix
68821096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com
6883c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
68841096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  wloop:
68851096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqu     xmm0, [eax]
68861096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqu     xmm1, [eax + 16]
68871096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    lea        eax, [eax + 32]
68881096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    pshufb     xmm0, xmm5
68891096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    pshufb     xmm1, xmm5
68901096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    sub        ecx, 8            // sets flags for jg; the stores/lea below leave them intact
68911096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqu     [edx], xmm0
68921096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    movdqu     [edx + 16], xmm1
68931096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    lea        edx, [edx + 32]
68941096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    jg         wloop
68951096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    ret
68961096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  }
68971096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com}
68981096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com
68991096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com#ifdef HAS_ARGBSHUFFLEROW_AVX2
// AVX2 byte shuffle: applies the 16-byte shuffler mask to the source, 64
// bytes (16 pixels) per loop pass.
//   src_argb - source; read with vmovdqu, no alignment requirement.
//   dst_argb - destination; written with vmovdqu, no alignment requirement.
//   shuffler - 16-byte shuffle mask, broadcast into both 128-bit halves of
//              ymm5.
//   pix      - counter decremented by 16 per pass; the loop body always runs
//              at least once and repeats while the remainder is > 0.
69001096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com__declspec(naked) __declspec(align(16))
69011096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.comvoid ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
69021096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com                         const uint8* shuffler, int pix) {
69031096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  __asm {
69041096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        eax, [esp + 4]     // src_argb
6905212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        edx, [esp + 8]     // dst_argb
69061096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        ecx, [esp + 12]    // shuffler
6907446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
69081096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    mov        ecx, [esp + 16]    // pix
69091096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com
6910c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
69111096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  wloop:
69121096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    vmovdqu    ymm0, [eax]
69131096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    vmovdqu    ymm1, [eax + 32]
69141096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    lea        eax, [eax + 64]
69151096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    vpshufb    ymm0, ymm0, ymm5
69161096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    vpshufb    ymm1, ymm1, ymm5
69171096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    sub        ecx, 16            // sets flags for jg; the stores/lea below leave them intact
69181096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    vmovdqu    [edx], ymm0
69191096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    vmovdqu    [edx + 32], ymm1
69201096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    lea        edx, [edx + 64]
69211096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    jg         wloop
69229b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com
69239b4c00b908d37727c6caf82337813d567732be1cfbarchard@google.com    vzeroupper                    // clear upper ymm halves before returning
69241096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com    ret
69251096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com  }
69261096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com}
69278b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com#endif  // HAS_ARGBSHUFFLEROW_AVX2
69281096543eaa1e596a93ba5d3863e637dc489e32ccfbarchard@google.com
// SSE2 (no pshufb) channel shuffle. Only the first four bytes of shuffler
// are examined. They are read as one little-endian dword and compared with
// four known patterns; a match selects a loop that handles 4 pixels (16
// bytes) per pass by widening bytes to words, permuting the words with
// pshufhw/pshuflw and packing back to bytes. Any other pattern falls through
// to a generic loop that copies one pixel per pass, using each of the four
// shuffler bytes as a byte offset from the current source pixel.
//   src_argb - source; every path uses unaligned or byte loads.
//   dst_argb - destination; every path uses unaligned or byte stores.
//   shuffler - byte indices; only shuffler[0..3] are read.
//   pix      - counter decremented by 4 per pass in the specialised loops and
//              by 1 in the generic loop; each loop body runs at least once
//              and repeats while the remainder is > 0.
// ebx and esi are pushed on entry and popped at shuf99 before returning.
6929212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com__declspec(naked) __declspec(align(16))
6930212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.comvoid ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
6931212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com                         const uint8* shuffler, int pix) {
6932212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  __asm {
6933212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    push       ebx
6934212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    push       esi
6935212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_argb
6936212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        edx, [esp + 8 + 8]    // dst_argb
6937212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        esi, [esp + 8 + 12]   // shuffler
6938212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        ecx, [esp + 8 + 16]   // pix
69398b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    pxor       xmm5, xmm5            // zero, used to widen bytes to words
6940212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
    // Dispatch on shuffler[0..3] read as a little-endian dword, e.g. the
    // bytes {2, 1, 0, 3} compare equal to 0x03000102.
6941212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        ebx, [esi]   // shuffler
6942212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    cmp        ebx, 0x03000102
6943212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    je         shuf_3012
6944212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    cmp        ebx, 0x00010203
6945212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    je         shuf_0123
6946212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    cmp        ebx, 0x00030201
6947212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    je         shuf_0321
6948212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    cmp        ebx, 0x02010003
6949212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    je         shuf_2103
6950212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
    // No pattern matched: generic path, one pixel per pass.
    // dst[k] = src[shuffler[k]] for k = 0..3.
6951212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  // TODO(fbarchard): Use one source pointer and 3 offsets.
6952212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  shuf_any1:
6953212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [esi]
6954212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [eax + ebx]
6955212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        [edx], bl
6956212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [esi + 1]
6957212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [eax + ebx]
6958212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        [edx + 1], bl
6959212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [esi + 2]
6960212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [eax + ebx]
6961212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        [edx + 2], bl
6962212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [esi + 3]
6963212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movzx      ebx, byte ptr [eax + ebx]
6964212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    mov        [edx + 3], bl
6965212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        eax, [eax + 4]
6966212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        edx, [edx + 4]
6967212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    sub        ecx, 1
6968212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jg         shuf_any1
6969212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jmp        shuf99
6970212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
    // Specialised paths below share one shape: load 16 bytes, split into low
    // and high halves widened to words, permute each group of four words with
    // the same immediate, pack back to bytes and store 16 bytes.
6971c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6972212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  shuf_0123:
6973212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     xmm0, [eax]
6974212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        eax, [eax + 16]
6975212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqa     xmm1, xmm0
69768b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpcklbw  xmm0, xmm5
69778b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpckhbw  xmm1, xmm5
6978212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
6979212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm0, xmm0, 01Bh
6980212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm1, xmm1, 01Bh
6981212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm1, xmm1, 01Bh
6982212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    packuswb   xmm0, xmm1
6983212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    sub        ecx, 4             // sets flags for jg; movdqu/lea below leave them intact
6984212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     [edx], xmm0
6985212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        edx, [edx + 16]
6986212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jg         shuf_0123
6987212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jmp        shuf99
6988212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
6989c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
6990212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  shuf_0321:
6991212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     xmm0, [eax]
6992212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        eax, [eax + 16]
6993212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqa     xmm1, xmm0
69948b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpcklbw  xmm0, xmm5
69958b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpckhbw  xmm1, xmm5
6996212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
6997212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm0, xmm0, 039h
6998212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm1, xmm1, 039h
6999212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm1, xmm1, 039h
7000212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    packuswb   xmm0, xmm1
7001212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    sub        ecx, 4
7002212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     [edx], xmm0
7003212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        edx, [edx + 16]
7004212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jg         shuf_0321
7005212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jmp        shuf99
7006212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
7007c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
7008212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  shuf_2103:
7009212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     xmm0, [eax]
7010212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        eax, [eax + 16]
7011212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqa     xmm1, xmm0
70128b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpcklbw  xmm0, xmm5
70138b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpckhbw  xmm1, xmm5
7014212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
7015212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm0, xmm0, 093h
7016212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm1, xmm1, 093h
7017212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm1, xmm1, 093h
7018212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    packuswb   xmm0, xmm1
7019212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    sub        ecx, 4
7020212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     [edx], xmm0
7021212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        edx, [edx + 16]
7022212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jg         shuf_2103
7023212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jmp        shuf99
7024212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
7025c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
7026212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  shuf_3012:
7027212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     xmm0, [eax]
7028212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        eax, [eax + 16]
7029212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqa     xmm1, xmm0
70308b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpcklbw  xmm0, xmm5
70318b0cdb4a6e3bc468b1901dcfff7acc93bbb6a981fbarchard@google.com    punpckhbw  xmm1, xmm5
7032212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
7033212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm0, xmm0, 0C6h
7034212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshufhw    xmm1, xmm1, 0C6h
7035212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pshuflw    xmm1, xmm1, 0C6h
7036212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    packuswb   xmm0, xmm1
7037212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    sub        ecx, 4
7038212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    movdqu     [edx], xmm0
7039212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    lea        edx, [edx + 16]
7040212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    jg         shuf_3012
7041212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
    // Common exit; the shuf_3012 loop falls through to here.
7042212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  shuf99:
7043212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pop        esi
7044212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    pop        ebx
7045212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com    ret
7046212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com  }
7047212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com}
7048212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8fbarchard@google.com
70499de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com// YUY2 - Macro-pixel = 2 image pixels
70509de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
70519de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
70529de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com// UYVY - Macro-pixel = 2 image pixels
70539de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com// U0Y0V0Y1
70549de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
// Interleaves separate Y, U and V rows into packed bytes in the order
// Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... Each loop pass reads 16 Y, 8 U and 8 V bytes
// and writes 32 output bytes.
//   src_y     - Y row; read 16 bytes at a time with movdqu.
//   src_u     - U row; read 8 bytes at a time with movq.
//   src_v     - V row; addressed as src_u + (src_v - src_u) so that only one
//               chroma pointer has to be advanced.
//   dst_frame - packed output; written with movdqu.
//   width     - counter decremented by 16 per pass; the loop body always
//               runs at least once and repeats while the remainder is > 0.
// esi and edi are pushed on entry and popped before returning.
70559de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com__declspec(naked) __declspec(align(16))
70569de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.comvoid I422ToYUY2Row_SSE2(const uint8* src_y,
70579de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com                        const uint8* src_u,
70589de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com                        const uint8* src_v,
70599de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com                        uint8* dst_frame, int width) {
70609de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com  __asm {
70619de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    push       esi
70629de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    push       edi
70639de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_y
70649de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        esi, [esp + 8 + 8]    // src_u
70659de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        edx, [esp + 8 + 12]   // src_v
70669de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_frame
70679de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        ecx, [esp + 8 + 20]   // width
70689de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    sub        edx, esi              // edx = src_v - src_u
70699de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
7070c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
70719de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com  convertloop:
70729de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    movq       xmm2, qword ptr [esi] // U
70739de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    movq       xmm3, qword ptr [esi + edx] // V
70749de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    lea        esi, [esi + 8]
70759de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    punpcklbw  xmm2, xmm3 // UV
7076f8e90176855a21248ef5213b34dadd46118e76fcfbarchard@google.com    movdqu     xmm0, [eax] // Y
70779de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    lea        eax, [eax + 16]
707855c20a861e3a503839fd2007b302bc20c11d9460fbarchard@google.com    movdqa     xmm1, xmm0            // copy Y for the high half
70799de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    punpcklbw  xmm0, xmm2 // YUYV
70809de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    punpckhbw  xmm1, xmm2
7081f8e90176855a21248ef5213b34dadd46118e76fcfbarchard@google.com    movdqu     [edi], xmm0
7082f8e90176855a21248ef5213b34dadd46118e76fcfbarchard@google.com    movdqu     [edi + 16], xmm1
70839de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    lea        edi, [edi + 32]
70849de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    sub        ecx, 16
70859de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    jg         convertloop
70869de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
70879de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    pop        edi
70889de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    pop        esi
70899de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    ret
70909de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com  }
70919de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com}
70929de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
// Interleaves separate Y, U and V rows into packed bytes in the order
// U0 Y0 V0 Y1 U1 Y2 V1 Y3 ... Each loop pass reads 16 Y, 8 U and 8 V bytes
// and writes 32 output bytes.
//   src_y     - Y row; read 16 bytes at a time with movdqu.
//   src_u     - U row; read 8 bytes at a time with movq.
//   src_v     - V row; addressed as src_u + (src_v - src_u) so that only one
//               chroma pointer has to be advanced.
//   dst_frame - packed output; written with movdqu.
//   width     - counter decremented by 16 per pass; the loop body always
//               runs at least once and repeats while the remainder is > 0.
// esi and edi are pushed on entry and popped before returning.
70939de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com__declspec(naked) __declspec(align(16))
70949de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.comvoid I422ToUYVYRow_SSE2(const uint8* src_y,
70959de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com                        const uint8* src_u,
70969de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com                        const uint8* src_v,
70979de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com                        uint8* dst_frame, int width) {
70989de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com  __asm {
70999de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    push       esi
71009de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    push       edi
71019de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_y
71029de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        esi, [esp + 8 + 8]    // src_u
71039de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        edx, [esp + 8 + 12]   // src_v
71049de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        edi, [esp + 8 + 16]   // dst_frame
71059de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    mov        ecx, [esp + 8 + 20]   // width
71069de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    sub        edx, esi              // edx = src_v - src_u
71079de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
7108c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
71099de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com  convertloop:
71109de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    movq       xmm2, qword ptr [esi] // U
71119de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    movq       xmm3, qword ptr [esi + edx] // V
71129de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    lea        esi, [esi + 8]
71139de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    punpcklbw  xmm2, xmm3 // UV
7114f8e90176855a21248ef5213b34dadd46118e76fcfbarchard@google.com    movdqu     xmm0, [eax] // Y
71159de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    movdqa     xmm1, xmm2            // copy UV for the low half
71169de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    lea        eax, [eax + 16]
71179de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    punpcklbw  xmm1, xmm0 // UYVY
71189de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    punpckhbw  xmm2, xmm0
7119f8e90176855a21248ef5213b34dadd46118e76fcfbarchard@google.com    movdqu     [edi], xmm1
7120f8e90176855a21248ef5213b34dadd46118e76fcfbarchard@google.com    movdqu     [edi + 16], xmm2
71219de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    lea        edi, [edi + 32]
71229de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    sub        ecx, 16
71239de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    jg         convertloop
71249de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com
71259de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    pop        edi
71269de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    pop        esi
71279de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com    ret
71289de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com  }
71299de8867ab636128ff667fdf50d3ede83e861d97afbarchard@google.com}
7130747ceb9fa5cea5c923d4b08acbb7f1cfa39f138efbarchard@google.com
7131ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to every channel byte of each 4-byte pixel:
//   dst = C0 + C1 * X + C2 * X * X + C3 * X * X * X
// where X is the source channel byte converted to float.
//   src_argb: source pixels; 8 bytes (2 pixels) are read per pass.
//   dst_argb: destination pixels; 8 bytes are written per pass.
//   poly:     C0..C3 are the 4-float groups at byte offsets 0, 16, 32 and 48,
//             one coefficient per channel lane.
//   width:    pixel count; decremented by 2 per pass.
// The counter is tested after the loop body, so the body always runs at least
// once, and an odd width still reads and writes a full 2-pixel group on the
// final pass.
// NOTE(review): mulps/addps use poly as a legacy-SSE memory operand, which
// presumably requires poly to be 16-byte aligned - confirm with callers.
// NOTE(review): the first packuswb narrows 16-bit lanes but is fed 32-bit
// integers; results beyond roughly +/-32767 look like they would not clamp to
// 0/255 as intended - confirm against the C reference implementation.
7132ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com__declspec(naked) __declspec(align(16))
7133ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.comvoid ARGBPolynomialRow_SSE2(const uint8* src_argb,
7134ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com                            uint8* dst_argb, const float* poly,
7135ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com                            int width) {
7136ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com  __asm {
7137c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    push       esi  // esi is used for poly; the extra "+ 4" below skips this push.
7138c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mov        eax, [esp + 4 + 4]   /* src_argb */
7139c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mov        edx, [esp + 4 + 8]   /* dst_argb */
7140c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mov        esi, [esp + 4 + 12]  /* poly */
7141c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mov        ecx, [esp + 4 + 16]  /* width */
71426da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
7143ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com
7144c3b04796c2c77e69f6bd7ca294825d31eae528bffbarchard@google.com    // 2 pixel loop.
7145c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
7146ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com convertloop:
71473075de82856a044ebd3e808b2f0918d2b0e9713cfbarchard@google.com//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
71483075de82856a044ebd3e808b2f0918d2b0e9713cfbarchard@google.com//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
7149c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    movq       xmm0, qword ptr [eax]  // BGRABGRA
7150c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    lea        eax, [eax + 8]  // advance source by 2 pixels
7151ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    punpcklbw  xmm0, xmm3  // bytes -> 16-bit words
7152c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    movdqa     xmm4, xmm0
7153c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    punpcklwd  xmm0, xmm3  // pixel 0
7154c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    punpckhwd  xmm4, xmm3  // pixel 1
7155ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    cvtdq2ps   xmm0, xmm0  // 4 floats
7156c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    cvtdq2ps   xmm4, xmm4
7157ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    movdqa     xmm1, xmm0  // X
7158c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    movdqa     xmm5, xmm4
7159c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm0, [esi + 16]  // C1 * X
7160c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm4, [esi + 16]
7161c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    addps      xmm0, [esi]  // result = C0 + C1 * X
7162c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    addps      xmm4, [esi]
7163ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    movdqa     xmm2, xmm1
7164c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    movdqa     xmm6, xmm5
7165ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    mulps      xmm2, xmm1  // X * X
7166c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm6, xmm5
7167ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    mulps      xmm1, xmm2  // X * X * X
7168c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm5, xmm6
7169c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm2, [esi + 32]  // C2 * X * X
7170c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm6, [esi + 32]
7171c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm1, [esi + 48]  // C3 * X * X * X
7172c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    mulps      xmm5, [esi + 48]
7173ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    addps      xmm0, xmm2  // result += C2 * X * X
7174c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    addps      xmm4, xmm6
7175ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    addps      xmm0, xmm1  // result += C3 * X * X * X
7176c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    addps      xmm4, xmm5
7177ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    cvttps2dq  xmm0, xmm0  // floats -> 32-bit ints, truncating
7178c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    cvttps2dq  xmm4, xmm4
7179c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    packuswb   xmm0, xmm4  // first narrowing step: pixel 0 and pixel 1 into one register
7180ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    packuswb   xmm0, xmm0  // second narrowing step: 8 result bytes in the low qword
7181c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    sub        ecx, 2  // sets the flags jg tests; movq/lea below leave them intact
7182c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    movq       qword ptr [edx], xmm0
7183c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    lea        edx, [edx + 8]  // advance destination by 2 pixels
7184ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    jg         convertloop
7185c3c06ec328b5ea6c57012d3ca3ca442f22aad681fbarchard@google.com    pop        esi
7186ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com    ret
7187ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com  }
7188ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com}
7189ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
7190ae0091e3a74603b23c91f417c8ea023cd43e7e9cfbarchard@google.com
71916da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// Applies a cubic polynomial to every channel byte of each 4-byte pixel:
//   dst = C0 + C1 * X + C2 * X * X + C3 * X * X * X
// where X is the source channel byte converted to float.
//   src_argb: source pixels; 8 bytes (2 pixels) are read per pass.
//   dst_argb: destination pixels; 8 bytes are written per pass.
//   poly:     C0..C3 are the 4-float groups at byte offsets 0, 16, 32 and 48;
//             each group is broadcast to both 128-bit halves of a ymm register
//             once, before the loop.
//   width:    pixel count; decremented by 2 per pass.
// The counter is tested after the loop body, so the body always runs at least
// once, and an odd width still reads and writes a full 2-pixel group on the
// final pass.
// NOTE(review): vfmadd132ps/vfmadd231ps belong to the FMA extension, which is
// reported separately from AVX2 - confirm the caller checks for both.
71926da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com__declspec(naked) __declspec(align(16))
71936da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.comvoid ARGBPolynomialRow_AVX2(const uint8* src_argb,
7194c3b04796c2c77e69f6bd7ca294825d31eae528bffbarchard@google.com                            uint8* dst_argb, const float* poly,
7195c3b04796c2c77e69f6bd7ca294825d31eae528bffbarchard@google.com                            int width) {
71966da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com  __asm {
71976da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com    mov        eax, [esp + 4]   /* src_argb */
71986da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com    mov        edx, [esp + 8]   /* dst_argb */
7199446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    mov        ecx, [esp + 12]   /* poly */
7200446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm4, [ecx]       // C0
7201446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm5, [ecx + 16]  // C1
7202446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm6, [ecx + 32]  // C2
7203446f91d040aea92c0522745d176fe8017bd22382fbarchard@google.com    vbroadcastf128 ymm7, [ecx + 48]  // C3
72046da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com    mov        ecx, [esp + 16]  /* width */  // ecx is reused: poly is no longer needed.
72056da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com
7206c3b04796c2c77e69f6bd7ca294825d31eae528bffbarchard@google.com    // 2 pixel loop.
7207c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
72086da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com convertloop:
72092bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
72102bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    lea         eax, [eax + 8]  // advance source by 2 pixels
72112bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vcvtdq2ps   ymm0, ymm0        // X 8 floats
72122bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vmulps      ymm2, ymm0, ymm0  // X * X
72132bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vmulps      ymm3, ymm0, ymm7  // C3 * X
72142bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
72152bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
72162bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
72172bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vcvttps2dq  ymm0, ymm0        // floats -> 32-bit ints, truncating
72182bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vpackusdw   ymm0, ymm0, ymm0  // per 128-bit half: 4 words, duplicated (pixel 0 low half, pixel 1 high half)
72192bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vpermq      ymm0, ymm0, 0xd8  // swap the middle qwords: pixel 0 words, then pixel 1 words, in the low 128 bits
72202bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vpackuswb   xmm0, xmm0, xmm0  // words -> bytes: bgrabgra in the low qword
72212bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    sub         ecx, 2  // sets the flags jg tests; vmovq/lea below leave them intact
72222bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    vmovq       qword ptr [edx], xmm0
72232bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    lea         edx, [edx + 8]  // advance destination by 2 pixels
72242bbb64df2c997725ab1a024a0a21f1c63f895797fbarchard@google.com    jg          convertloop
72256da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com    vzeroupper
72266da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com    ret
72276da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com  }
72286da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com}
72296da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
72306da76f3b34e80da2ffebff92d57fd08a93964942fbarchard@google.com
72316f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com#ifdef HAS_ARGBCOLORTABLEROW_X86
72326f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com// Transform ARGB pixels with color table.
// Works in place on dst_argb: for each 4-byte pixel, the byte at channel
// offset k (0..3) with value v is replaced by table_argb[v * 4 + k].
//   dst_argb:   pixels to rewrite, 4 bytes each.
//   table_argb: lookup table indexed with a stride of 4 bytes per entry.
//   width:      pixel count; one pixel is handled per pass.
// The counter is tested after the loop body, so the body always runs at least
// once.
72336f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com__declspec(naked) __declspec(align(16))
72346f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.comvoid ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
72356f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com                           int width) {
72366f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com  __asm {
72376f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    push       esi  // esi holds the table; the extra "+ 4" below skips this push.
72386f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        eax, [esp + 4 + 4]   /* dst_argb */
72396f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        esi, [esp + 4 + 8]   /* table_argb */
72406f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        ecx, [esp + 4 + 12]  /* width */
72417a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com
72426f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    // 1 pixel loop.
72436f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    align      4
72446f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com  convertloop:
72456f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax]  // channel 0 value
72466f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    lea        eax, [eax + 4]  // advance early; the current pixel is now at [eax - 4]
72476f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4]
72486f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4], dl
72496f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax - 4 + 1]  // channel 1 value
72506f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4 + 1]
72516f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4 + 1], dl
72526f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax - 4 + 2]  // channel 2 value
72536f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4 + 2]
72546f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4 + 2], dl
72556f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax - 4 + 3]  // channel 3 value
72566f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4 + 3]
72576f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4 + 3], dl
72586f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    dec        ecx
72596f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    jg         convertloop
72606f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pop        esi
72616f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    ret
72626f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com  }
72636f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com}
72646f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com#endif  // HAS_ARGBCOLORTABLEROW_X86
72657a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com
72666f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com#ifdef HAS_RGBCOLORTABLEROW_X86
72676f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com// Transform RGB pixels with color table.
// Works in place on dst_argb: for each 4-byte pixel, the byte at channel
// offset k (0..2) with value v is replaced by table_argb[v * 4 + k]. The byte
// at offset 3 of each pixel is neither read nor written.
//   dst_argb:   pixels to rewrite, 4 bytes each.
//   table_argb: lookup table indexed with a stride of 4 bytes per entry.
//   width:      pixel count; one pixel is handled per pass.
// The counter is tested after the loop body, so the body always runs at least
// once.
72687a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com__declspec(naked) __declspec(align(16))
72696f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.comvoid RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
72707a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com  __asm {
72716f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    push       esi  // esi holds the table; the extra "+ 4" below skips this push.
72726f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        eax, [esp + 4 + 4]   /* dst_argb */
72736f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        esi, [esp + 4 + 8]   /* table_argb */
72746f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        ecx, [esp + 4 + 12]  /* width */
72756f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
72766f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    // 1 pixel loop.
72776f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    align      4
72786f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com  convertloop:
72796f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax]  // channel 0 value
72806f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    lea        eax, [eax + 4]  // advance early; the current pixel is now at [eax - 4]
72816f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4]
72826f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4], dl
72836f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax - 4 + 1]  // channel 1 value
72846f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4 + 1]
72856f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4 + 1], dl
72866f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax - 4 + 2]  // channel 2 value
72876f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx * 4 + 2]
72886f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [eax - 4 + 2], dl
72896f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    dec        ecx
72906f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    jg         convertloop
72916f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
72926f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pop        esi
72937a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com    ret
72947a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com  }
72957a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com}
72966f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com#endif  // HAS_RGBCOLORTABLEROW_X86
72977a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com
72986f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
72996f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com// Transform RGB pixels with luma table.
// For each pixel a weighted sum of its 4 bytes is formed from the 4 bytes of
// lumacoeff; the sum's low 8 bits are masked off and the remainder is added to
// the luma pointer to pick a table position for that pixel. Bytes 0..2 of the
// pixel are then replaced by table[value]; byte 3 (alpha) is copied unchanged.
//   src_argb:  source pixels; 16 bytes (4 pixels) are read per pass.
//   dst_argb:  destination pixels; 16 bytes are written per pass.
//   width:     pixel count; decremented by 4 per pass.
//   luma:      table base address, treated as a 32-bit value.
//   lumacoeff: 4 packed byte weights, one per pixel byte.
// The counter is tested after the loop body, so the body always runs at least
// once, and a width that is not a multiple of 4 still processes a full
// 4-pixel group on the final pass.
// NOTE(review): movdqu transfers 16 bytes, so the "qword ptr" size specifier
// on the load looks inconsistent (xmmword ptr expected) - confirm that the
// assembler ignores it.
73006f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com__declspec(naked) __declspec(align(16))
730111a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.comvoid ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
730211a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com                                 int width,
730311a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com                                 const uint8* luma, uint32 lumacoeff) {
73046f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com  __asm {
73056f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    push       esi
73066f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    push       edi  // two pushes, hence the extra "+ 8" in the argument offsets below.
73076f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        eax, [esp + 8 + 4]   /* src_argb */
73086f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        edi, [esp + 8 + 8]   /* dst_argb */
730911a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com    mov        ecx, [esp + 8 + 12]  /* width */
731011a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
731111a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
73126f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pshufd     xmm2, xmm2, 0  // broadcast the table base to all 4 dwords
731321796c94aa3a448a839e6a18aad060f018958156fbarchard@google.com    pshufd     xmm3, xmm3, 0  // broadcast the 4 weights so they line up with every pixel
731411a0d48e45a7acd5aaf6b914caeee06432f06b6bfbarchard@google.com    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
73156f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    psllw      xmm4, 8
73166f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pxor       xmm5, xmm5  // zero, used to widen words to dwords
73176f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73186f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    // 4 pixel loop.
73196f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    align      4
73206f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com  convertloop:
7321ca8f826ba3894ba1db7e58b38e6469fd1d3ab59ffbarchard@google.com    movdqu     xmm0, qword ptr [eax]      // generate luma ptr
73226f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pmaddubsw  xmm0, xmm3  // weight each byte and sum adjacent pairs into words
73236f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    phaddw     xmm0, xmm0  // sum the two words of each pixel: one word per pixel
73246f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pand       xmm0, xmm4  // mask out low bits
73256f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    punpcklwd  xmm0, xmm5  // widen the 4 per-pixel words to dwords
73266f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    paddd      xmm0, xmm2  // add table base
73276f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movd       esi, xmm0  // table pointer for pixel 0
73286f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
73296f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73306f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax]
73316f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73326f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi], dl
73336f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 1]
73346f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73356f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 1], dl
73366f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 2]
73376f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73386f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 2], dl
73396f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 3]  // copy alpha.
73406f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 3], dl
73416f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73426f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movd       esi, xmm0  // table pointer for pixel 1
73436f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
73446f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73456f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 4]
73466f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73476f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 4], dl
73486f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 5]
73496f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73506f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 5], dl
73516f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 6]
73526f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73536f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 6], dl
73546f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 7]  // copy alpha.
73556f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 7], dl
73566f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73576f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movd       esi, xmm0  // table pointer for pixel 2
73586f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
73596f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73606f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 8]
73616f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73626f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 8], dl
73636f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 9]
73646f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73656f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 9], dl
73666f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 10]
73676f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73686f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 10], dl
73696f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 11]  // copy alpha.
73706f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 11], dl
73716f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73726f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movd       esi, xmm0  // table pointer for pixel 3
73736f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73746f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 12]
73756f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73766f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 12], dl
73776f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 13]
73786f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73796f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 13], dl
73806f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 14]
73816f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [esi + edx]
73826f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 14], dl
73836f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    movzx      edx, byte ptr [eax + 15]  // copy alpha.
73846f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    mov        byte ptr [edi + 15], dl
73856f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73866f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    sub        ecx, 4  // sets the flags jg tests; the lea instructions below leave them intact
73876f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    lea        eax, [eax + 16]
73886f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    lea        edi, [edi + 16]
73896f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    jg         convertloop
73906f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com
73916f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pop        edi
73926f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    pop        esi
73936f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com    ret
73947a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com  }
73957a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com}
73966f7e514caa3e1c57ab1fd765151c52b9156113befbarchard@google.com#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
73977a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185fbarchard@google.com
7398e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#endif  // defined(_M_X64)
7399e6dd1fa024bac6c62cbed8e5227cc9bc311a9d9dfbarchard@google.com#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
7400221e602f8a726f7457a0d521b5bcca05d89215bbfbarchard@google.com
7401fe5ff7ed5451496281697bda9cb85084c532926cfbarchard@google.com#ifdef __cplusplus
74025327adda475f79405a008a967d30bf7c92e994admikhal@webrtc.org}  // extern "C"
7403fe5ff7ed5451496281697bda9cb85084c532926cfbarchard@google.com}  // namespace libyuv
7404fe5ff7ed5451496281697bda9cb85084c532926cfbarchard@google.com#endif
7405